1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_Inode" 586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_Inode" 779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 const PetscInt *r,*c,*rout,*cout; 785 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 786 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 821 for(j=0; j<nz-1; j+=2){ 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 } 829 if(j == nz-1){ 830 tmp0 = tmps[*vi++]; 831 sum1 -= *v1++ *tmp0; 832 } 833 tmp[row ++]=sum1; 834 break; 835 case 2: 836 sum1 = b[*r++]; 837 sum2 = b[*r++]; 838 v2 = aa + ai[row+1]; 839 840 for(j=0; j<nz-1; j+=2){ 841 i0 = vi[0]; 842 i1 = vi[1]; 843 vi +=2; 844 tmp0 = tmps[i0]; 845 tmp1 = tmps[i1]; 846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 848 } 849 if(j == nz-1){ 850 tmp0 = tmps[*vi++]; 851 sum1 -= *v1++ *tmp0; 852 sum2 -= *v2++ *tmp0; 853 } 854 sum2 -= *v2++ * sum1; 855 tmp[row ++]=sum1; 856 tmp[row ++]=sum2; 857 break; 858 case 3: 859 sum1 = b[*r++]; 860 sum2 = b[*r++]; 861 sum3 = b[*r++]; 862 v2 = aa + ai[row+1]; 863 v3 = aa + ai[row+2]; 864 865 for (j=0; j<nz-1; j+=2){ 866 i0 = vi[0]; 867 i1 = vi[1]; 868 vi +=2; 869 tmp0 = tmps[i0]; 870 tmp1 = tmps[i1]; 871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 874 } 875 if (j == nz-1){ 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ *tmp0; 878 sum2 -= *v2++ *tmp0; 879 sum3 -= *v3++ *tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 tmp[row ++]=sum1; 885 tmp[row ++]=sum2; 886 tmp[row ++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2){ 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1){ 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row ++]=sum1; 924 tmp[row ++]=sum2; 925 tmp[row ++]=sum3; 926 tmp[row ++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2){ 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1){ 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row ++]=sum1; 972 tmp[row ++]=sum2; 973 tmp[row ++]=sum3; 974 tmp[row ++]=sum4; 975 tmp[row ++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz){ /* Each loop in 'case' is unrolled */ 989 case 1 : 990 sum1 = tmp[row]; 991 992 for(j=nz ; j>1; j-=2){ 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1){ 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2 : 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz ; j>1; j-=2){ 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1){ 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3 : 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz ; j>1; j-=2){ 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1){ 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4 : 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz ; j>1; j-=2){ 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1){ 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5 : 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz ; j>1; j-=2){ 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1){ 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 /* 1173 Makes a longer coloring[] array and calls the usual code with that 1174 */ 1175 #undef __FUNCT__ 1176 #define __FUNCT__ "MatColoringPatch_Inode" 1177 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1178 { 1179 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1180 PetscErrorCode ierr; 1181 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1182 PetscInt *colorused,i; 1183 ISColoringValue *newcolor; 1184 1185 PetscFunctionBegin; 1186 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1187 /* loop over inodes, marking a color for each column*/ 1188 row = 0; 1189 for (i=0; i<m; i++){ 1190 for (j=0; j<ns[i]; j++) { 1191 newcolor[row++] = coloring[i] + j*ncolors; 1192 } 1193 } 1194 1195 /* eliminate unneeded colors */ 1196 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1197 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1198 for (i=0; i<n; i++) { 1199 colorused[newcolor[i]] = 1; 1200 } 1201 1202 for (i=1; i<5*ncolors; i++) { 1203 colorused[i] += colorused[i-1]; 1204 } 1205 ncolors = colorused[5*ncolors-1]; 1206 for (i=0; i<n; i++) { 1207 newcolor[i] = colorused[newcolor[i]]-1; 1208 } 1209 ierr = PetscFree(colorused);CHKERRQ(ierr); 1210 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1211 ierr = PetscFree(coloring);CHKERRQ(ierr); 1212 PetscFunctionReturn(0); 1213 } 1214 1215 #include "../src/mat/blockinvert.h" 1216 1217 #undef __FUNCT__ 1218 #define __FUNCT__ "MatRelax_Inode" 1219 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1220 { 1221 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1222 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1223 MatScalar *ibdiag,*bdiag; 1224 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1225 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1226 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1227 PetscErrorCode ierr; 1228 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1229 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1230 1231 PetscFunctionBegin; 1232 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1233 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1234 if (its > 1) { 1235 /* switch to non-inode version */ 1236 ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1237 PetscFunctionReturn(0); 1238 } 1239 1240 if (!a->inode.ibdiagvalid) { 1241 if (!a->inode.ibdiag) { 1242 /* calculate space needed for diagonal blocks */ 1243 for (i=0; i<m; i++) { 1244 cnt += sizes[i]*sizes[i]; 1245 } 1246 a->inode.bdiagsize = cnt; 1247 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1248 } 1249 1250 /* copy over the diagonal blocks and invert them */ 1251 ibdiag = a->inode.ibdiag; 1252 bdiag = a->inode.bdiag; 1253 cnt = 0; 1254 for (i=0, row = 0; i<m; i++) { 1255 for (j=0; j<sizes[i]; j++) { 1256 for (k=0; k<sizes[i]; k++) { 1257 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1258 } 1259 } 1260 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1261 1262 switch(sizes[i]) { 1263 case 1: 1264 /* Create matrix data structure */ 1265 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1266 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1267 break; 1268 case 2: 1269 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1270 break; 1271 case 3: 1272 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1273 break; 1274 case 4: 1275 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1276 break; 1277 case 5: 1278 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1279 break; 1280 default: 1281 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1282 } 1283 cnt += sizes[i]*sizes[i]; 1284 row += sizes[i]; 1285 } 1286 a->inode.ibdiagvalid = PETSC_TRUE; 1287 } 1288 ibdiag = a->inode.ibdiag; 1289 bdiag = a->inode.bdiag; 1290 1291 1292 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1293 if (flag & SOR_ZERO_INITIAL_GUESS) { 1294 PetscScalar *b; 1295 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1296 if (xx != bb) { 1297 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1298 } else { 1299 b = x; 1300 } 1301 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1302 1303 for (i=0, row=0; i<m; i++) { 1304 sz = diag[row] - ii[row]; 1305 v1 = a->a + ii[row]; 1306 idx = a->j + ii[row]; 1307 1308 /* see comments for MatMult_Inode() for how this is coded */ 1309 switch (sizes[i]){ 1310 case 1: 1311 1312 sum1 = b[row]; 1313 for(n = 0; n<sz-1; n+=2) { 1314 i1 = idx[0]; 1315 i2 = idx[1]; 1316 idx += 2; 1317 tmp0 = x[i1]; 1318 tmp1 = x[i2]; 1319 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1320 } 1321 1322 if (n == sz-1){ 1323 tmp0 = x[*idx]; 1324 sum1 -= *v1 * tmp0; 1325 } 1326 x[row++] = sum1*(*ibdiag++); 1327 break; 1328 case 2: 1329 v2 = a->a + ii[row+1]; 1330 sum1 = b[row]; 1331 sum2 = b[row+1]; 1332 for(n = 0; n<sz-1; n+=2) { 1333 i1 = idx[0]; 1334 i2 = idx[1]; 1335 idx += 2; 1336 tmp0 = x[i1]; 1337 tmp1 = x[i2]; 1338 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1339 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1340 } 1341 1342 if (n == sz-1){ 1343 tmp0 = x[*idx]; 1344 sum1 -= v1[0] * tmp0; 1345 sum2 -= v2[0] * tmp0; 1346 } 1347 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1348 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1349 ibdiag += 4; 1350 break; 1351 case 3: 1352 v2 = a->a + ii[row+1]; 1353 v3 = a->a + ii[row+2]; 1354 sum1 = b[row]; 1355 sum2 = b[row+1]; 1356 sum3 = b[row+2]; 1357 for(n = 0; n<sz-1; n+=2) { 1358 i1 = idx[0]; 1359 i2 = idx[1]; 1360 idx += 2; 1361 tmp0 = x[i1]; 1362 tmp1 = x[i2]; 1363 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1364 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1365 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1366 } 1367 1368 if (n == sz-1){ 1369 tmp0 = x[*idx]; 1370 sum1 -= v1[0] * tmp0; 1371 sum2 -= v2[0] * tmp0; 1372 sum3 -= v3[0] * tmp0; 1373 } 1374 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1375 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1376 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1377 ibdiag += 9; 1378 break; 1379 case 4: 1380 v2 = a->a + ii[row+1]; 1381 v3 = a->a + ii[row+2]; 1382 v4 = a->a + ii[row+3]; 1383 sum1 = b[row]; 1384 sum2 = b[row+1]; 1385 sum3 = b[row+2]; 1386 sum4 = b[row+3]; 1387 for(n = 0; n<sz-1; n+=2) { 1388 i1 = idx[0]; 1389 i2 = idx[1]; 1390 idx += 2; 1391 tmp0 = x[i1]; 1392 tmp1 = x[i2]; 1393 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1394 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1395 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1396 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1397 } 1398 1399 if (n == sz-1){ 1400 tmp0 = x[*idx]; 1401 sum1 -= v1[0] * tmp0; 1402 sum2 -= v2[0] * tmp0; 1403 sum3 -= v3[0] * tmp0; 1404 sum4 -= v4[0] * tmp0; 1405 } 1406 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1407 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1408 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1409 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1410 ibdiag += 16; 1411 break; 1412 case 5: 1413 v2 = a->a + ii[row+1]; 1414 v3 = a->a + ii[row+2]; 1415 v4 = a->a + ii[row+3]; 1416 v5 = a->a + ii[row+4]; 1417 sum1 = b[row]; 1418 sum2 = b[row+1]; 1419 sum3 = b[row+2]; 1420 sum4 = b[row+3]; 1421 sum5 = b[row+4]; 1422 for(n = 0; n<sz-1; n+=2) { 1423 i1 = idx[0]; 1424 i2 = idx[1]; 1425 idx += 2; 1426 tmp0 = x[i1]; 1427 tmp1 = x[i2]; 1428 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1429 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1430 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1431 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1432 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1433 } 1434 1435 if (n == sz-1){ 1436 tmp0 = x[*idx]; 1437 sum1 -= v1[0] * tmp0; 1438 sum2 -= v2[0] * tmp0; 1439 sum3 -= v3[0] * tmp0; 1440 sum4 -= v4[0] * tmp0; 1441 sum5 -= v5[0] * tmp0; 1442 } 1443 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1444 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1445 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1446 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1447 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1448 ibdiag += 25; 1449 break; 1450 default: 1451 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1452 } 1453 } 1454 1455 xb = x; 1456 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1457 } else xb = b; 1458 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1459 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1460 cnt = 0; 1461 for (i=0, row=0; i<m; i++) { 1462 1463 switch (sizes[i]){ 1464 case 1: 1465 x[row++] *= bdiag[cnt++]; 1466 break; 1467 case 2: 1468 x1 = x[row]; x2 = x[row+1]; 1469 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1470 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1471 x[row++] = tmp1; 1472 x[row++] = tmp2; 1473 cnt += 4; 1474 break; 1475 case 3: 1476 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1477 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1478 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1479 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1480 x[row++] = tmp1; 1481 x[row++] = tmp2; 1482 x[row++] = tmp3; 1483 cnt += 9; 1484 break; 1485 case 4: 1486 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1487 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1488 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1489 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1490 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1491 x[row++] = tmp1; 1492 x[row++] = tmp2; 1493 x[row++] = tmp3; 1494 x[row++] = tmp4; 1495 cnt += 16; 1496 break; 1497 case 5: 1498 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1499 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1500 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1501 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1502 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1503 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1504 x[row++] = tmp1; 1505 x[row++] = tmp2; 1506 x[row++] = tmp3; 1507 x[row++] = tmp4; 1508 x[row++] = tmp5; 1509 cnt += 25; 1510 break; 1511 default: 1512 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1513 } 1514 } 1515 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1516 } 1517 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1518 1519 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1520 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1521 ibdiag -= sizes[i]*sizes[i]; 1522 sz = ii[row+1] - diag[row] - 1; 1523 v1 = a->a + diag[row] + 1; 1524 idx = a->j + diag[row] + 1; 1525 1526 /* see comments for MatMult_Inode() for how this is coded */ 1527 switch (sizes[i]){ 1528 case 1: 1529 1530 sum1 = xb[row]; 1531 for(n = 0; n<sz-1; n+=2) { 1532 i1 = idx[0]; 1533 i2 = idx[1]; 1534 idx += 2; 1535 tmp0 = x[i1]; 1536 tmp1 = x[i2]; 1537 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1538 } 1539 1540 if (n == sz-1){ 1541 tmp0 = x[*idx]; 1542 sum1 -= *v1*tmp0; 1543 } 1544 x[row--] = sum1*(*ibdiag); 1545 break; 1546 1547 case 2: 1548 1549 sum1 = xb[row]; 1550 sum2 = xb[row-1]; 1551 /* note that sum1 is associated with the second of the two rows */ 1552 v2 = a->a + diag[row-1] + 2; 1553 for(n = 0; n<sz-1; n+=2) { 1554 i1 = idx[0]; 1555 i2 = idx[1]; 1556 idx += 2; 1557 tmp0 = x[i1]; 1558 tmp1 = x[i2]; 1559 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1560 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1561 } 1562 1563 if (n == sz-1){ 1564 tmp0 = x[*idx]; 1565 sum1 -= *v1*tmp0; 1566 sum2 -= *v2*tmp0; 1567 } 1568 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1569 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1570 break; 1571 case 3: 1572 1573 sum1 = xb[row]; 1574 sum2 = xb[row-1]; 1575 sum3 = xb[row-2]; 1576 v2 = a->a + diag[row-1] + 2; 1577 v3 = a->a + diag[row-2] + 3; 1578 for(n = 0; n<sz-1; n+=2) { 1579 i1 = idx[0]; 1580 i2 = idx[1]; 1581 idx += 2; 1582 tmp0 = x[i1]; 1583 tmp1 = x[i2]; 1584 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1585 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1586 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1587 } 1588 1589 if (n == sz-1){ 1590 tmp0 = x[*idx]; 1591 sum1 -= *v1*tmp0; 1592 sum2 -= *v2*tmp0; 1593 sum3 -= *v3*tmp0; 1594 } 1595 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1596 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1597 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1598 break; 1599 case 4: 1600 1601 sum1 = xb[row]; 1602 sum2 = xb[row-1]; 1603 sum3 = xb[row-2]; 1604 sum4 = xb[row-3]; 1605 v2 = a->a + diag[row-1] + 2; 1606 v3 = a->a + diag[row-2] + 3; 1607 v4 = a->a + diag[row-3] + 4; 1608 for(n = 0; n<sz-1; n+=2) { 1609 i1 = idx[0]; 1610 i2 = idx[1]; 1611 idx += 2; 1612 tmp0 = x[i1]; 1613 tmp1 = x[i2]; 1614 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1615 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1616 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1617 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1618 } 1619 1620 if (n == sz-1){ 1621 tmp0 = x[*idx]; 1622 sum1 -= *v1*tmp0; 1623 sum2 -= *v2*tmp0; 1624 sum3 -= *v3*tmp0; 1625 sum4 -= *v4*tmp0; 1626 } 1627 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 1628 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 1629 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 1630 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 1631 break; 1632 case 5: 1633 1634 sum1 = xb[row]; 1635 sum2 = xb[row-1]; 1636 sum3 = xb[row-2]; 1637 sum4 = xb[row-3]; 1638 sum5 = xb[row-4]; 1639 v2 = a->a + diag[row-1] + 2; 1640 v3 = a->a + diag[row-2] + 3; 1641 v4 = a->a + diag[row-3] + 4; 1642 v5 = a->a + diag[row-4] + 5; 1643 for(n = 0; n<sz-1; n+=2) { 1644 i1 = idx[0]; 1645 i2 = idx[1]; 1646 idx += 2; 1647 tmp0 = x[i1]; 1648 tmp1 = x[i2]; 1649 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1650 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1651 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1652 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1653 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1654 } 1655 1656 if (n == sz-1){ 1657 tmp0 = x[*idx]; 1658 sum1 -= *v1*tmp0; 1659 sum2 -= *v2*tmp0; 1660 sum3 -= *v3*tmp0; 1661 sum4 -= *v4*tmp0; 1662 sum5 -= *v5*tmp0; 1663 } 1664 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 1665 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 1666 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 1667 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 1668 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 1669 break; 1670 default: 1671 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1672 } 1673 } 1674 1675 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1676 } 1677 its--; 1678 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1679 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 1680 } 1681 if (flag & SOR_EISENSTAT) { 1682 const PetscScalar *b; 1683 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1684 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1685 MatScalar *t = a->inode.ssor_work; 1686 /* 1687 Apply (U + D)^-1 where D is now the block diagonal 1688 */ 1689 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1690 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1691 ibdiag -= sizes[i]*sizes[i]; 1692 sz = ii[row+1] - diag[row] - 1; 1693 v1 = a->a + diag[row] + 1; 1694 idx = a->j + diag[row] + 1; 1695 CHKMEMQ; 1696 /* see comments for MatMult_Inode() for how this is coded */ 1697 switch (sizes[i]){ 1698 case 1: 1699 1700 sum1 = b[row]; 1701 for(n = 0; n<sz-1; n+=2) { 1702 i1 = idx[0]; 1703 i2 = idx[1]; 1704 idx += 2; 1705 tmp0 = x[i1]; 1706 tmp1 = x[i2]; 1707 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1708 } 1709 1710 if (n == sz-1){ 1711 tmp0 = x[*idx]; 1712 sum1 -= *v1*tmp0; 1713 } 1714 x[row] = sum1*(*ibdiag);row--; 1715 break; 1716 1717 case 2: 1718 1719 sum1 = b[row]; 1720 sum2 = b[row-1]; 1721 /* note that sum1 is associated with the second of the two rows */ 1722 v2 = a->a + diag[row-1] + 2; 1723 for(n = 0; n<sz-1; n+=2) { 1724 i1 = idx[0]; 1725 i2 = idx[1]; 1726 idx += 2; 1727 tmp0 = x[i1]; 1728 tmp1 = x[i2]; 1729 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1730 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1731 } 1732 1733 if (n == sz-1){ 1734 tmp0 = x[*idx]; 1735 sum1 -= *v1*tmp0; 1736 sum2 -= *v2*tmp0; 1737 } 1738 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1739 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1740 row -= 2; 1741 break; 1742 case 3: 1743 1744 sum1 = b[row]; 1745 sum2 = b[row-1]; 1746 sum3 = b[row-2]; 1747 v2 = a->a + diag[row-1] + 2; 1748 v3 = a->a + diag[row-2] + 3; 1749 for(n = 0; n<sz-1; n+=2) { 1750 i1 = idx[0]; 1751 i2 = idx[1]; 1752 idx += 2; 1753 tmp0 = x[i1]; 1754 tmp1 = x[i2]; 1755 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1756 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1757 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1758 } 1759 1760 if (n == sz-1){ 1761 tmp0 = x[*idx]; 1762 sum1 -= *v1*tmp0; 1763 sum2 -= *v2*tmp0; 1764 sum3 -= *v3*tmp0; 1765 } 1766 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1767 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1768 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1769 row -= 3; 1770 break; 1771 case 4: 1772 1773 sum1 = b[row]; 1774 sum2 = b[row-1]; 1775 sum3 = b[row-2]; 1776 sum4 = b[row-3]; 1777 v2 = a->a + diag[row-1] + 2; 1778 v3 = a->a + diag[row-2] + 3; 1779 v4 = a->a + diag[row-3] + 4; 1780 for(n = 0; n<sz-1; n+=2) { 1781 i1 = idx[0]; 1782 i2 = idx[1]; 1783 idx += 2; 1784 tmp0 = x[i1]; 1785 tmp1 = x[i2]; 1786 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1787 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1788 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1789 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1790 } 1791 1792 if (n == sz-1){ 1793 tmp0 = x[*idx]; 1794 sum1 -= *v1*tmp0; 1795 sum2 -= *v2*tmp0; 1796 sum3 -= *v3*tmp0; 1797 sum4 -= *v4*tmp0; 1798 } 1799 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 1800 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 1801 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 1802 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 1803 row -= 4; 1804 break; 1805 case 5: 1806 1807 sum1 = b[row]; 1808 sum2 = b[row-1]; 1809 sum3 = b[row-2]; 1810 sum4 = b[row-3]; 1811 sum5 = b[row-4]; 1812 v2 = a->a + diag[row-1] + 2; 1813 v3 = a->a + diag[row-2] + 3; 1814 v4 = a->a + diag[row-3] + 4; 1815 v5 = a->a + diag[row-4] + 5; 1816 for(n = 0; n<sz-1; n+=2) { 1817 i1 = idx[0]; 1818 i2 = idx[1]; 1819 idx += 2; 1820 tmp0 = x[i1]; 1821 tmp1 = x[i2]; 1822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1824 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1825 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1826 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1827 } 1828 1829 if (n == sz-1){ 1830 tmp0 = x[*idx]; 1831 sum1 -= *v1*tmp0; 1832 sum2 -= *v2*tmp0; 1833 sum3 -= *v3*tmp0; 1834 sum4 -= *v4*tmp0; 1835 sum5 -= *v5*tmp0; 1836 } 1837 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 1838 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 1839 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 1840 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 1841 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 1842 row -= 5; 1843 break; 1844 default: 1845 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1846 } 1847 CHKMEMQ; 1848 } 1849 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1850 1851 /* 1852 t = b - D x where D is the block diagonal 1853 */ 1854 cnt = 0; 1855 for (i=0, row=0; i<m; i++) { 1856 CHKMEMQ; 1857 switch (sizes[i]){ 1858 case 1: 1859 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 1860 break; 1861 case 2: 1862 x1 = x[row]; x2 = x[row+1]; 1863 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1864 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1865 t[row] = b[row] - tmp1; 1866 t[row+1] = b[row+1] - tmp2; row += 2; 1867 cnt += 4; 1868 break; 1869 case 3: 1870 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1871 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1872 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1873 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1874 t[row] = b[row] - tmp1; 1875 t[row+1] = b[row+1] - tmp2; 1876 t[row+2] = b[row+2] - tmp3; row += 3; 1877 cnt += 9; 1878 break; 1879 case 4: 1880 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1881 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1882 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1883 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1884 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1885 t[row] = b[row] - tmp1; 1886 t[row+1] = b[row+1] - tmp2; 1887 t[row+2] = b[row+2] - tmp3; 1888 t[row+3] = b[row+3] - tmp4; row += 4; 1889 cnt += 16; 1890 break; 1891 case 5: 1892 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1893 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1894 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1895 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1896 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1897 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1898 t[row] = b[row] - tmp1; 1899 t[row+1] = b[row+1] - tmp2; 1900 t[row+2] = b[row+2] - tmp3; 1901 t[row+3] = b[row+3] - tmp4; 1902 t[row+4] = b[row+4] - tmp5;row += 5; 1903 cnt += 25; 1904 break; 1905 default: 1906 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1907 } 1908 CHKMEMQ; 1909 } 1910 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1911 1912 1913 1914 /* 1915 Apply (L + D)^-1 where D is the block diagonal 1916 */ 1917 for (i=0, row=0; i<m; i++) { 1918 sz = diag[row] - ii[row]; 1919 v1 = a->a + ii[row]; 1920 idx = a->j + ii[row]; 1921 CHKMEMQ; 1922 /* see comments for MatMult_Inode() for how this is coded */ 1923 switch (sizes[i]){ 1924 case 1: 1925 1926 sum1 = t[row]; 1927 for(n = 0; n<sz-1; n+=2) { 1928 i1 = idx[0]; 1929 i2 = idx[1]; 1930 idx += 2; 1931 tmp0 = t[i1]; 1932 tmp1 = t[i2]; 1933 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1934 } 1935 1936 if (n == sz-1){ 1937 tmp0 = t[*idx]; 1938 sum1 -= *v1 * tmp0; 1939 } 1940 x[row] += t[row] = sum1*(*ibdiag++); row++; 1941 break; 1942 case 2: 1943 v2 = a->a + ii[row+1]; 1944 sum1 = t[row]; 1945 sum2 = t[row+1]; 1946 for(n = 0; n<sz-1; n+=2) { 1947 i1 = idx[0]; 1948 i2 = idx[1]; 1949 idx += 2; 1950 tmp0 = t[i1]; 1951 tmp1 = t[i2]; 1952 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1953 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1954 } 1955 1956 if (n == sz-1){ 1957 tmp0 = t[*idx]; 1958 sum1 -= v1[0] * tmp0; 1959 sum2 -= v2[0] * tmp0; 1960 } 1961 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1962 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1963 ibdiag += 4; row += 2; 1964 break; 1965 case 3: 1966 v2 = a->a + ii[row+1]; 1967 v3 = a->a + ii[row+2]; 1968 sum1 = t[row]; 1969 sum2 = t[row+1]; 1970 sum3 = t[row+2]; 1971 for(n = 0; n<sz-1; n+=2) { 1972 i1 = idx[0]; 1973 i2 = idx[1]; 1974 idx += 2; 1975 tmp0 = t[i1]; 1976 tmp1 = t[i2]; 1977 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1978 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1979 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1980 } 1981 1982 if (n == sz-1){ 1983 tmp0 = t[*idx]; 1984 sum1 -= v1[0] * tmp0; 1985 sum2 -= v2[0] * tmp0; 1986 sum3 -= v3[0] * tmp0; 1987 } 1988 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1989 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1990 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1991 ibdiag += 9; row += 3; 1992 break; 1993 case 4: 1994 v2 = a->a + ii[row+1]; 1995 v3 = a->a + ii[row+2]; 1996 v4 = a->a + ii[row+3]; 1997 sum1 = t[row]; 1998 sum2 = t[row+1]; 1999 sum3 = t[row+2]; 2000 sum4 = t[row+3]; 2001 for(n = 0; n<sz-1; n+=2) { 2002 i1 = idx[0]; 2003 i2 = idx[1]; 2004 idx += 2; 2005 tmp0 = t[i1]; 2006 tmp1 = t[i2]; 2007 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2008 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2009 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2010 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2011 } 2012 2013 if (n == sz-1){ 2014 tmp0 = t[*idx]; 2015 sum1 -= v1[0] * tmp0; 2016 sum2 -= v2[0] * tmp0; 2017 sum3 -= v3[0] * tmp0; 2018 sum4 -= v4[0] * tmp0; 2019 } 2020 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2021 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2022 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2023 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2024 ibdiag += 16; row += 4; 2025 break; 2026 case 5: 2027 v2 = a->a + ii[row+1]; 2028 v3 = a->a + ii[row+2]; 2029 v4 = a->a + ii[row+3]; 2030 v5 = a->a + ii[row+4]; 2031 sum1 = t[row]; 2032 sum2 = t[row+1]; 2033 sum3 = t[row+2]; 2034 sum4 = t[row+3]; 2035 sum5 = t[row+4]; 2036 for(n = 0; n<sz-1; n+=2) { 2037 i1 = idx[0]; 2038 i2 = idx[1]; 2039 idx += 2; 2040 tmp0 = t[i1]; 2041 tmp1 = t[i2]; 2042 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2043 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2044 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2045 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2046 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2047 } 2048 2049 if (n == sz-1){ 2050 tmp0 = t[*idx]; 2051 sum1 -= v1[0] * tmp0; 2052 sum2 -= v2[0] * tmp0; 2053 sum3 -= v3[0] * tmp0; 2054 sum4 -= v4[0] * tmp0; 2055 sum5 -= v5[0] * tmp0; 2056 } 2057 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2058 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2059 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2060 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2061 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2062 ibdiag += 25; row += 5; 2063 break; 2064 default: 2065 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2066 } 2067 CHKMEMQ; 2068 } 2069 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2070 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2071 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2072 } 2073 PetscFunctionReturn(0); 2074 } 2075 2076 #undef __FUNCT__ 2077 #define __FUNCT__ "MatMultDiagonalBlock_Inode" 2078 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx) 2079 { 2080 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2081 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2082 const MatScalar *bdiag = a->inode.bdiag; 2083 const PetscScalar *b; 2084 PetscErrorCode ierr; 2085 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2086 const PetscInt *sizes = a->inode.size; 2087 2088 PetscFunctionBegin; 2089 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2090 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2091 cnt = 0; 2092 for (i=0, row=0; i<m; i++) { 2093 switch (sizes[i]){ 2094 case 1: 2095 x[row] = b[row]*bdiag[cnt++];row++; 2096 break; 2097 case 2: 2098 x1 = b[row]; x2 = b[row+1]; 2099 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2100 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2101 x[row++] = tmp1; 2102 x[row++] = tmp2; 2103 cnt += 4; 2104 break; 2105 case 3: 2106 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2107 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2108 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2109 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2110 x[row++] = tmp1; 2111 x[row++] = tmp2; 2112 x[row++] = tmp3; 2113 cnt += 9; 2114 break; 2115 case 4: 2116 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2117 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2118 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2119 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2120 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2121 x[row++] = tmp1; 2122 x[row++] = tmp2; 2123 x[row++] = tmp3; 2124 x[row++] = tmp4; 2125 cnt += 16; 2126 break; 2127 case 5: 2128 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2129 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2130 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2131 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2132 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2133 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2134 x[row++] = tmp1; 2135 x[row++] = tmp2; 2136 x[row++] = tmp3; 2137 x[row++] = tmp4; 2138 x[row++] = tmp5; 2139 cnt += 25; 2140 break; 2141 default: 2142 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2143 } 2144 } 2145 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2147 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2148 PetscFunctionReturn(0); 2149 } 2150 2151 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec); 2152 /* 2153 samestructure indicates that the matrix has not changed its nonzero structure so we 2154 do not need to recompute the inodes 2155 */ 2156 #undef __FUNCT__ 2157 #define __FUNCT__ "Mat_CheckInode" 2158 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2159 { 2160 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2161 PetscErrorCode ierr; 2162 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2163 PetscTruth flag; 2164 2165 PetscFunctionBegin; 2166 if (!a->inode.use) PetscFunctionReturn(0); 2167 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2168 2169 2170 m = A->rmap->n; 2171 if (a->inode.size) {ns = a->inode.size;} 2172 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2173 2174 i = 0; 2175 node_count = 0; 2176 idx = a->j; 2177 ii = a->i; 2178 while (i < m){ /* For each row */ 2179 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2180 /* Limits the number of elements in a node to 'a->inode.limit' */ 2181 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2182 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2183 if (nzy != nzx) break; 2184 idy += nzx; /* Same nonzero pattern */ 2185 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2186 if (!flag) break; 2187 } 2188 ns[node_count++] = blk_size; 2189 idx += blk_size*nzx; 2190 i = j; 2191 } 2192 /* If not enough inodes found,, do not use inode version of the routines */ 2193 if (!a->inode.size && m && node_count > .9*m) { 2194 ierr = PetscFree(ns);CHKERRQ(ierr); 2195 a->inode.node_count = 0; 2196 a->inode.size = PETSC_NULL; 2197 a->inode.use = PETSC_FALSE; 2198 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2199 } else { 2200 A->ops->mult = MatMult_Inode; 2201 A->ops->relax = MatRelax_Inode; 2202 A->ops->multadd = MatMultAdd_Inode; 2203 A->ops->getrowij = MatGetRowIJ_Inode; 2204 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2205 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2206 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2207 A->ops->coloringpatch = MatColoringPatch_Inode; 2208 A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode; 2209 a->inode.node_count = node_count; 2210 a->inode.size = ns; 2211 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2212 } 2213 PetscFunctionReturn(0); 2214 } 2215 2216 /* 2217 This is really ugly. if inodes are used this replaces the 2218 permutations with ones that correspond to rows/cols of the matrix 2219 rather then inode blocks 2220 */ 2221 #undef __FUNCT__ 2222 #define __FUNCT__ "MatInodeAdjustForInodes" 2223 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2224 { 2225 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2226 2227 PetscFunctionBegin; 2228 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2229 if (f) { 2230 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2231 } 2232 PetscFunctionReturn(0); 2233 } 2234 2235 EXTERN_C_BEGIN 2236 #undef __FUNCT__ 2237 #define __FUNCT__ "MatAdjustForInodes_Inode" 2238 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2239 { 2240 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2241 PetscErrorCode ierr; 2242 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2243 const PetscInt *ridx,*cidx; 2244 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2245 PetscInt nslim_col,*ns_col; 2246 IS ris = *rperm,cis = *cperm; 2247 2248 PetscFunctionBegin; 2249 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2250 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2251 2252 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2253 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2254 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2255 permc = permr + m; 2256 2257 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2258 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2259 2260 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2261 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2262 2263 /* Construct the permutations for rows*/ 2264 for (i=0,row = 0; i<nslim_row; ++i){ 2265 indx = ridx[i]; 2266 start_val = tns[indx]; 2267 end_val = tns[indx + 1]; 2268 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2269 } 2270 2271 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2272 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2273 2274 /* Construct permutations for columns */ 2275 for (i=0,col=0; i<nslim_col; ++i){ 2276 indx = cidx[i]; 2277 start_val = tns[indx]; 2278 end_val = tns[indx + 1]; 2279 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2280 } 2281 2282 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2283 ISSetPermutation(*rperm); 2284 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2285 ISSetPermutation(*cperm); 2286 2287 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2288 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2289 2290 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2291 ierr = PetscFree(permr);CHKERRQ(ierr); 2292 ierr = ISDestroy(cis);CHKERRQ(ierr); 2293 ierr = ISDestroy(ris);CHKERRQ(ierr); 2294 ierr = PetscFree(tns);CHKERRQ(ierr); 2295 PetscFunctionReturn(0); 2296 } 2297 EXTERN_C_END 2298 2299 #undef __FUNCT__ 2300 #define __FUNCT__ "MatInodeGetInodeSizes" 2301 /*@C 2302 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2303 2304 Collective on Mat 2305 2306 Input Parameter: 2307 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2308 2309 Output Parameter: 2310 + node_count - no of inodes present in the matrix. 2311 . sizes - an array of size node_count,with sizes of each inode. 2312 - limit - the max size used to generate the inodes. 2313 2314 Level: advanced 2315 2316 Notes: This routine returns some internal storage information 2317 of the matrix, it is intended to be used by advanced users. 2318 It should be called after the matrix is assembled. 2319 The contents of the sizes[] array should not be changed. 2320 PETSC_NULL may be passed for information not requested. 2321 2322 .keywords: matrix, seqaij, get, inode 2323 2324 .seealso: MatGetInfo() 2325 @*/ 2326 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2327 { 2328 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2329 2330 PetscFunctionBegin; 2331 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2332 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2333 if (f) { 2334 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2335 } 2336 PetscFunctionReturn(0); 2337 } 2338 2339 EXTERN_C_BEGIN 2340 #undef __FUNCT__ 2341 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2342 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2343 { 2344 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2345 2346 PetscFunctionBegin; 2347 if (node_count) *node_count = a->inode.node_count; 2348 if (sizes) *sizes = a->inode.size; 2349 if (limit) *limit = a->inode.limit; 2350 PetscFunctionReturn(0); 2351 } 2352 EXTERN_C_END 2353