1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 779 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 const PetscInt *r,*c,*rout,*cout; 785 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 786 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 for(j=0; j<nz-1; j+=2){ 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if(j == nz-1){ 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row ++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for(j=0; j<nz-1; j+=2){ 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if(j == nz-1){ 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ * sum1; 854 tmp[row ++]=sum1; 855 tmp[row ++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2){ 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1){ 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 tmp[row ++]=sum1; 884 tmp[row ++]=sum2; 885 tmp[row ++]=sum3; 886 break; 887 888 case 4: 889 sum1 = b[*r++]; 890 sum2 = b[*r++]; 891 sum3 = b[*r++]; 892 sum4 = b[*r++]; 893 v2 = aa + ai[row+1]; 894 v3 = aa + ai[row+2]; 895 v4 = aa + ai[row+3]; 896 897 for (j=0; j<nz-1; j+=2){ 898 i0 = vi[0]; 899 i1 = vi[1]; 900 vi +=2; 901 tmp0 = tmps[i0]; 902 tmp1 = tmps[i1]; 903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 906 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 907 } 908 if (j == nz-1){ 909 tmp0 = tmps[*vi++]; 910 sum1 -= *v1++ *tmp0; 911 sum2 -= *v2++ *tmp0; 912 sum3 -= *v3++ *tmp0; 913 sum4 -= *v4++ *tmp0; 914 } 915 sum2 -= *v2++ * sum1; 916 sum3 -= *v3++ * sum1; 917 sum4 -= *v4++ * sum1; 918 sum3 -= *v3++ * sum2; 919 sum4 -= *v4++ * sum2; 920 sum4 -= *v4++ * sum3; 921 922 tmp[row ++]=sum1; 923 tmp[row ++]=sum2; 924 tmp[row ++]=sum3; 925 tmp[row ++]=sum4; 926 break; 927 case 5: 928 sum1 = b[*r++]; 929 sum2 = b[*r++]; 930 sum3 = b[*r++]; 931 sum4 = b[*r++]; 932 sum5 = b[*r++]; 933 v2 = aa + ai[row+1]; 934 v3 = aa + ai[row+2]; 935 v4 = aa + ai[row+3]; 936 v5 = aa + ai[row+4]; 937 938 for (j=0; j<nz-1; j+=2){ 939 i0 = vi[0]; 940 i1 = vi[1]; 941 vi +=2; 942 tmp0 = tmps[i0]; 943 tmp1 = tmps[i1]; 944 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 945 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 946 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 947 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 948 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 949 } 950 if (j == nz-1){ 951 tmp0 = tmps[*vi++]; 952 sum1 -= *v1++ *tmp0; 953 sum2 -= *v2++ *tmp0; 954 sum3 -= *v3++ *tmp0; 955 sum4 -= *v4++ *tmp0; 956 sum5 -= *v5++ *tmp0; 957 } 958 959 sum2 -= *v2++ * sum1; 960 sum3 -= *v3++ * sum1; 961 sum4 -= *v4++ * sum1; 962 sum5 -= *v5++ * sum1; 963 sum3 -= *v3++ * sum2; 964 sum4 -= *v4++ * sum2; 965 sum5 -= *v5++ * sum2; 966 sum4 -= *v4++ * sum3; 967 sum5 -= *v5++ * sum3; 968 sum5 -= *v5++ * sum4; 969 970 tmp[row ++]=sum1; 971 tmp[row ++]=sum2; 972 tmp[row ++]=sum3; 973 tmp[row ++]=sum4; 974 tmp[row ++]=sum5; 975 break; 976 default: 977 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 978 } 979 } 980 /* backward solve the upper triangular */ 981 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 982 nsz = ns[i]; 983 aii = ai[row+1] -1; 984 v1 = aa + aii; 985 vi = aj + aii; 986 nz = aii- ad[row]; 987 switch (nsz){ /* Each loop in 'case' is unrolled */ 988 case 1 : 989 sum1 = tmp[row]; 990 991 for(j=nz ; j>1; j-=2){ 992 vi -=2; 993 i0 = vi[2]; 994 i1 = vi[1]; 995 tmp0 = tmps[i0]; 996 tmp1 = tmps[i1]; 997 v1 -= 2; 998 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 999 } 1000 if (j==1){ 1001 tmp0 = tmps[*vi--]; 1002 sum1 -= *v1-- * tmp0; 1003 } 1004 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1005 break; 1006 case 2 : 1007 sum1 = tmp[row]; 1008 sum2 = tmp[row -1]; 1009 v2 = aa + ai[row]-1; 1010 for (j=nz ; j>1; j-=2){ 1011 vi -=2; 1012 i0 = vi[2]; 1013 i1 = vi[1]; 1014 tmp0 = tmps[i0]; 1015 tmp1 = tmps[i1]; 1016 v1 -= 2; 1017 v2 -= 2; 1018 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1019 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1020 } 1021 if (j==1){ 1022 tmp0 = tmps[*vi--]; 1023 sum1 -= *v1-- * tmp0; 1024 sum2 -= *v2-- * tmp0; 1025 } 1026 1027 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1028 sum2 -= *v2-- * tmp0; 1029 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1030 break; 1031 case 3 : 1032 sum1 = tmp[row]; 1033 sum2 = tmp[row -1]; 1034 sum3 = tmp[row -2]; 1035 v2 = aa + ai[row]-1; 1036 v3 = aa + ai[row -1]-1; 1037 for (j=nz ; j>1; j-=2){ 1038 vi -=2; 1039 i0 = vi[2]; 1040 i1 = vi[1]; 1041 tmp0 = tmps[i0]; 1042 tmp1 = tmps[i1]; 1043 v1 -= 2; 1044 v2 -= 2; 1045 v3 -= 2; 1046 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1047 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1048 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1049 } 1050 if (j==1){ 1051 tmp0 = tmps[*vi--]; 1052 sum1 -= *v1-- * tmp0; 1053 sum2 -= *v2-- * tmp0; 1054 sum3 -= *v3-- * tmp0; 1055 } 1056 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1057 sum2 -= *v2-- * tmp0; 1058 sum3 -= *v3-- * tmp0; 1059 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1060 sum3 -= *v3-- * tmp0; 1061 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1062 1063 break; 1064 case 4 : 1065 sum1 = tmp[row]; 1066 sum2 = tmp[row -1]; 1067 sum3 = tmp[row -2]; 1068 sum4 = tmp[row -3]; 1069 v2 = aa + ai[row]-1; 1070 v3 = aa + ai[row -1]-1; 1071 v4 = aa + ai[row -2]-1; 1072 1073 for (j=nz ; j>1; j-=2){ 1074 vi -=2; 1075 i0 = vi[2]; 1076 i1 = vi[1]; 1077 tmp0 = tmps[i0]; 1078 tmp1 = tmps[i1]; 1079 v1 -= 2; 1080 v2 -= 2; 1081 v3 -= 2; 1082 v4 -= 2; 1083 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1084 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1085 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1086 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1087 } 1088 if (j==1){ 1089 tmp0 = tmps[*vi--]; 1090 sum1 -= *v1-- * tmp0; 1091 sum2 -= *v2-- * tmp0; 1092 sum3 -= *v3-- * tmp0; 1093 sum4 -= *v4-- * tmp0; 1094 } 1095 1096 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1097 sum2 -= *v2-- * tmp0; 1098 sum3 -= *v3-- * tmp0; 1099 sum4 -= *v4-- * tmp0; 1100 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1104 sum4 -= *v4-- * tmp0; 1105 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1106 break; 1107 case 5 : 1108 sum1 = tmp[row]; 1109 sum2 = tmp[row -1]; 1110 sum3 = tmp[row -2]; 1111 sum4 = tmp[row -3]; 1112 sum5 = tmp[row -4]; 1113 v2 = aa + ai[row]-1; 1114 v3 = aa + ai[row -1]-1; 1115 v4 = aa + ai[row -2]-1; 1116 v5 = aa + ai[row -3]-1; 1117 for (j=nz ; j>1; j-=2){ 1118 vi -= 2; 1119 i0 = vi[2]; 1120 i1 = vi[1]; 1121 tmp0 = tmps[i0]; 1122 tmp1 = tmps[i1]; 1123 v1 -= 2; 1124 v2 -= 2; 1125 v3 -= 2; 1126 v4 -= 2; 1127 v5 -= 2; 1128 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1129 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1130 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1131 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1132 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1133 } 1134 if (j==1){ 1135 tmp0 = tmps[*vi--]; 1136 sum1 -= *v1-- * tmp0; 1137 sum2 -= *v2-- * tmp0; 1138 sum3 -= *v3-- * tmp0; 1139 sum4 -= *v4-- * tmp0; 1140 sum5 -= *v5-- * tmp0; 1141 } 1142 1143 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1144 sum2 -= *v2-- * tmp0; 1145 sum3 -= *v3-- * tmp0; 1146 sum4 -= *v4-- * tmp0; 1147 sum5 -= *v5-- * tmp0; 1148 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1149 sum3 -= *v3-- * tmp0; 1150 sum4 -= *v4-- * tmp0; 1151 sum5 -= *v5-- * tmp0; 1152 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1156 sum5 -= *v5-- * tmp0; 1157 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1158 break; 1159 default: 1160 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1161 } 1162 } 1163 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1164 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1165 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1167 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1168 PetscFunctionReturn(0); 1169 } 1170 1171 /* 1172 Makes a longer coloring[] array and calls the usual code with that 1173 */ 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 1176 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1177 { 1178 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1179 PetscErrorCode ierr; 1180 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1181 PetscInt *colorused,i; 1182 ISColoringValue *newcolor; 1183 1184 PetscFunctionBegin; 1185 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1186 /* loop over inodes, marking a color for each column*/ 1187 row = 0; 1188 for (i=0; i<m; i++){ 1189 for (j=0; j<ns[i]; j++) { 1190 newcolor[row++] = coloring[i] + j*ncolors; 1191 } 1192 } 1193 1194 /* eliminate unneeded colors */ 1195 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1196 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1197 for (i=0; i<n; i++) { 1198 colorused[newcolor[i]] = 1; 1199 } 1200 1201 for (i=1; i<5*ncolors; i++) { 1202 colorused[i] += colorused[i-1]; 1203 } 1204 ncolors = colorused[5*ncolors-1]; 1205 for (i=0; i<n; i++) { 1206 newcolor[i] = colorused[newcolor[i]]-1; 1207 } 1208 ierr = PetscFree(colorused);CHKERRQ(ierr); 1209 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1210 ierr = PetscFree(coloring);CHKERRQ(ierr); 1211 PetscFunctionReturn(0); 1212 } 1213 1214 #include "../src/mat/blockinvert.h" 1215 1216 #undef __FUNCT__ 1217 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 1218 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1219 { 1220 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1221 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1222 MatScalar *ibdiag,*bdiag; 1223 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1224 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1225 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1226 PetscErrorCode ierr; 1227 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1228 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1229 1230 PetscFunctionBegin; 1231 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1232 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1233 if (its > 1) { 1234 /* switch to non-inode version */ 1235 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1236 PetscFunctionReturn(0); 1237 } 1238 1239 if (!a->inode.ibdiagvalid) { 1240 if (!a->inode.ibdiag) { 1241 /* calculate space needed for diagonal blocks */ 1242 for (i=0; i<m; i++) { 1243 cnt += sizes[i]*sizes[i]; 1244 } 1245 a->inode.bdiagsize = cnt; 1246 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1247 } 1248 1249 /* copy over the diagonal blocks and invert them */ 1250 ibdiag = a->inode.ibdiag; 1251 bdiag = a->inode.bdiag; 1252 cnt = 0; 1253 for (i=0, row = 0; i<m; i++) { 1254 for (j=0; j<sizes[i]; j++) { 1255 for (k=0; k<sizes[i]; k++) { 1256 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1257 } 1258 } 1259 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1260 1261 switch(sizes[i]) { 1262 case 1: 1263 /* Create matrix data structure */ 1264 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1265 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1266 break; 1267 case 2: 1268 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1269 break; 1270 case 3: 1271 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1272 break; 1273 case 4: 1274 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1275 break; 1276 case 5: 1277 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1278 break; 1279 default: 1280 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1281 } 1282 cnt += sizes[i]*sizes[i]; 1283 row += sizes[i]; 1284 } 1285 a->inode.ibdiagvalid = PETSC_TRUE; 1286 } 1287 ibdiag = a->inode.ibdiag; 1288 bdiag = a->inode.bdiag; 1289 1290 1291 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1292 if (flag & SOR_ZERO_INITIAL_GUESS) { 1293 PetscScalar *b; 1294 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1295 if (xx != bb) { 1296 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1297 } else { 1298 b = x; 1299 } 1300 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1301 1302 for (i=0, row=0; i<m; i++) { 1303 sz = diag[row] - ii[row]; 1304 v1 = a->a + ii[row]; 1305 idx = a->j + ii[row]; 1306 1307 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1308 switch (sizes[i]){ 1309 case 1: 1310 1311 sum1 = b[row]; 1312 for(n = 0; n<sz-1; n+=2) { 1313 i1 = idx[0]; 1314 i2 = idx[1]; 1315 idx += 2; 1316 tmp0 = x[i1]; 1317 tmp1 = x[i2]; 1318 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1319 } 1320 1321 if (n == sz-1){ 1322 tmp0 = x[*idx]; 1323 sum1 -= *v1 * tmp0; 1324 } 1325 x[row++] = sum1*(*ibdiag++); 1326 break; 1327 case 2: 1328 v2 = a->a + ii[row+1]; 1329 sum1 = b[row]; 1330 sum2 = b[row+1]; 1331 for(n = 0; n<sz-1; n+=2) { 1332 i1 = idx[0]; 1333 i2 = idx[1]; 1334 idx += 2; 1335 tmp0 = x[i1]; 1336 tmp1 = x[i2]; 1337 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1338 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1339 } 1340 1341 if (n == sz-1){ 1342 tmp0 = x[*idx]; 1343 sum1 -= v1[0] * tmp0; 1344 sum2 -= v2[0] * tmp0; 1345 } 1346 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1347 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1348 ibdiag += 4; 1349 break; 1350 case 3: 1351 v2 = a->a + ii[row+1]; 1352 v3 = a->a + ii[row+2]; 1353 sum1 = b[row]; 1354 sum2 = b[row+1]; 1355 sum3 = b[row+2]; 1356 for(n = 0; n<sz-1; n+=2) { 1357 i1 = idx[0]; 1358 i2 = idx[1]; 1359 idx += 2; 1360 tmp0 = x[i1]; 1361 tmp1 = x[i2]; 1362 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1363 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1364 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1365 } 1366 1367 if (n == sz-1){ 1368 tmp0 = x[*idx]; 1369 sum1 -= v1[0] * tmp0; 1370 sum2 -= v2[0] * tmp0; 1371 sum3 -= v3[0] * tmp0; 1372 } 1373 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1374 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1375 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1376 ibdiag += 9; 1377 break; 1378 case 4: 1379 v2 = a->a + ii[row+1]; 1380 v3 = a->a + ii[row+2]; 1381 v4 = a->a + ii[row+3]; 1382 sum1 = b[row]; 1383 sum2 = b[row+1]; 1384 sum3 = b[row+2]; 1385 sum4 = b[row+3]; 1386 for(n = 0; n<sz-1; n+=2) { 1387 i1 = idx[0]; 1388 i2 = idx[1]; 1389 idx += 2; 1390 tmp0 = x[i1]; 1391 tmp1 = x[i2]; 1392 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1393 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1394 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1395 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1396 } 1397 1398 if (n == sz-1){ 1399 tmp0 = x[*idx]; 1400 sum1 -= v1[0] * tmp0; 1401 sum2 -= v2[0] * tmp0; 1402 sum3 -= v3[0] * tmp0; 1403 sum4 -= v4[0] * tmp0; 1404 } 1405 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1406 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1407 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1408 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1409 ibdiag += 16; 1410 break; 1411 case 5: 1412 v2 = a->a + ii[row+1]; 1413 v3 = a->a + ii[row+2]; 1414 v4 = a->a + ii[row+3]; 1415 v5 = a->a + ii[row+4]; 1416 sum1 = b[row]; 1417 sum2 = b[row+1]; 1418 sum3 = b[row+2]; 1419 sum4 = b[row+3]; 1420 sum5 = b[row+4]; 1421 for(n = 0; n<sz-1; n+=2) { 1422 i1 = idx[0]; 1423 i2 = idx[1]; 1424 idx += 2; 1425 tmp0 = x[i1]; 1426 tmp1 = x[i2]; 1427 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1428 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1429 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1430 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1431 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1432 } 1433 1434 if (n == sz-1){ 1435 tmp0 = x[*idx]; 1436 sum1 -= v1[0] * tmp0; 1437 sum2 -= v2[0] * tmp0; 1438 sum3 -= v3[0] * tmp0; 1439 sum4 -= v4[0] * tmp0; 1440 sum5 -= v5[0] * tmp0; 1441 } 1442 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1443 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1444 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1445 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1446 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1447 ibdiag += 25; 1448 break; 1449 default: 1450 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1451 } 1452 } 1453 1454 xb = x; 1455 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1456 } else xb = b; 1457 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1458 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1459 cnt = 0; 1460 for (i=0, row=0; i<m; i++) { 1461 1462 switch (sizes[i]){ 1463 case 1: 1464 x[row++] *= bdiag[cnt++]; 1465 break; 1466 case 2: 1467 x1 = x[row]; x2 = x[row+1]; 1468 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1469 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1470 x[row++] = tmp1; 1471 x[row++] = tmp2; 1472 cnt += 4; 1473 break; 1474 case 3: 1475 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1476 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1477 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1478 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1479 x[row++] = tmp1; 1480 x[row++] = tmp2; 1481 x[row++] = tmp3; 1482 cnt += 9; 1483 break; 1484 case 4: 1485 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1486 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1487 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1488 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1489 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1490 x[row++] = tmp1; 1491 x[row++] = tmp2; 1492 x[row++] = tmp3; 1493 x[row++] = tmp4; 1494 cnt += 16; 1495 break; 1496 case 5: 1497 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1498 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1499 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1500 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1501 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1502 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1503 x[row++] = tmp1; 1504 x[row++] = tmp2; 1505 x[row++] = tmp3; 1506 x[row++] = tmp4; 1507 x[row++] = tmp5; 1508 cnt += 25; 1509 break; 1510 default: 1511 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1512 } 1513 } 1514 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1515 } 1516 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1517 1518 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1519 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1520 ibdiag -= sizes[i]*sizes[i]; 1521 sz = ii[row+1] - diag[row] - 1; 1522 v1 = a->a + diag[row] + 1; 1523 idx = a->j + diag[row] + 1; 1524 1525 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1526 switch (sizes[i]){ 1527 case 1: 1528 1529 sum1 = xb[row]; 1530 for(n = 0; n<sz-1; n+=2) { 1531 i1 = idx[0]; 1532 i2 = idx[1]; 1533 idx += 2; 1534 tmp0 = x[i1]; 1535 tmp1 = x[i2]; 1536 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1537 } 1538 1539 if (n == sz-1){ 1540 tmp0 = x[*idx]; 1541 sum1 -= *v1*tmp0; 1542 } 1543 x[row--] = sum1*(*ibdiag); 1544 break; 1545 1546 case 2: 1547 1548 sum1 = xb[row]; 1549 sum2 = xb[row-1]; 1550 /* note that sum1 is associated with the second of the two rows */ 1551 v2 = a->a + diag[row-1] + 2; 1552 for(n = 0; n<sz-1; n+=2) { 1553 i1 = idx[0]; 1554 i2 = idx[1]; 1555 idx += 2; 1556 tmp0 = x[i1]; 1557 tmp1 = x[i2]; 1558 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1559 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1560 } 1561 1562 if (n == sz-1){ 1563 tmp0 = x[*idx]; 1564 sum1 -= *v1*tmp0; 1565 sum2 -= *v2*tmp0; 1566 } 1567 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1568 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1569 break; 1570 case 3: 1571 1572 sum1 = xb[row]; 1573 sum2 = xb[row-1]; 1574 sum3 = xb[row-2]; 1575 v2 = a->a + diag[row-1] + 2; 1576 v3 = a->a + diag[row-2] + 3; 1577 for(n = 0; n<sz-1; n+=2) { 1578 i1 = idx[0]; 1579 i2 = idx[1]; 1580 idx += 2; 1581 tmp0 = x[i1]; 1582 tmp1 = x[i2]; 1583 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1584 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1585 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1586 } 1587 1588 if (n == sz-1){ 1589 tmp0 = x[*idx]; 1590 sum1 -= *v1*tmp0; 1591 sum2 -= *v2*tmp0; 1592 sum3 -= *v3*tmp0; 1593 } 1594 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1595 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1596 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1597 break; 1598 case 4: 1599 1600 sum1 = xb[row]; 1601 sum2 = xb[row-1]; 1602 sum3 = xb[row-2]; 1603 sum4 = xb[row-3]; 1604 v2 = a->a + diag[row-1] + 2; 1605 v3 = a->a + diag[row-2] + 3; 1606 v4 = a->a + diag[row-3] + 4; 1607 for(n = 0; n<sz-1; n+=2) { 1608 i1 = idx[0]; 1609 i2 = idx[1]; 1610 idx += 2; 1611 tmp0 = x[i1]; 1612 tmp1 = x[i2]; 1613 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1614 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1615 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1616 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1617 } 1618 1619 if (n == sz-1){ 1620 tmp0 = x[*idx]; 1621 sum1 -= *v1*tmp0; 1622 sum2 -= *v2*tmp0; 1623 sum3 -= *v3*tmp0; 1624 sum4 -= *v4*tmp0; 1625 } 1626 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 1627 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 1628 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 1629 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 1630 break; 1631 case 5: 1632 1633 sum1 = xb[row]; 1634 sum2 = xb[row-1]; 1635 sum3 = xb[row-2]; 1636 sum4 = xb[row-3]; 1637 sum5 = xb[row-4]; 1638 v2 = a->a + diag[row-1] + 2; 1639 v3 = a->a + diag[row-2] + 3; 1640 v4 = a->a + diag[row-3] + 4; 1641 v5 = a->a + diag[row-4] + 5; 1642 for(n = 0; n<sz-1; n+=2) { 1643 i1 = idx[0]; 1644 i2 = idx[1]; 1645 idx += 2; 1646 tmp0 = x[i1]; 1647 tmp1 = x[i2]; 1648 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1649 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1650 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1651 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1652 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1653 } 1654 1655 if (n == sz-1){ 1656 tmp0 = x[*idx]; 1657 sum1 -= *v1*tmp0; 1658 sum2 -= *v2*tmp0; 1659 sum3 -= *v3*tmp0; 1660 sum4 -= *v4*tmp0; 1661 sum5 -= *v5*tmp0; 1662 } 1663 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 1664 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 1665 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 1666 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 1667 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 1668 break; 1669 default: 1670 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1671 } 1672 } 1673 1674 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1675 } 1676 its--; 1677 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1678 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 1679 } 1680 if (flag & SOR_EISENSTAT) { 1681 const PetscScalar *b; 1682 MatScalar *t = a->inode.ssor_work; 1683 1684 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1685 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1686 /* 1687 Apply (U + D)^-1 where D is now the block diagonal 1688 */ 1689 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1690 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1691 ibdiag -= sizes[i]*sizes[i]; 1692 sz = ii[row+1] - diag[row] - 1; 1693 v1 = a->a + diag[row] + 1; 1694 idx = a->j + diag[row] + 1; 1695 CHKMEMQ; 1696 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1697 switch (sizes[i]){ 1698 case 1: 1699 1700 sum1 = b[row]; 1701 for(n = 0; n<sz-1; n+=2) { 1702 i1 = idx[0]; 1703 i2 = idx[1]; 1704 idx += 2; 1705 tmp0 = x[i1]; 1706 tmp1 = x[i2]; 1707 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1708 } 1709 1710 if (n == sz-1){ 1711 tmp0 = x[*idx]; 1712 sum1 -= *v1*tmp0; 1713 } 1714 x[row] = sum1*(*ibdiag);row--; 1715 break; 1716 1717 case 2: 1718 1719 sum1 = b[row]; 1720 sum2 = b[row-1]; 1721 /* note that sum1 is associated with the second of the two rows */ 1722 v2 = a->a + diag[row-1] + 2; 1723 for(n = 0; n<sz-1; n+=2) { 1724 i1 = idx[0]; 1725 i2 = idx[1]; 1726 idx += 2; 1727 tmp0 = x[i1]; 1728 tmp1 = x[i2]; 1729 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1730 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1731 } 1732 1733 if (n == sz-1){ 1734 tmp0 = x[*idx]; 1735 sum1 -= *v1*tmp0; 1736 sum2 -= *v2*tmp0; 1737 } 1738 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1739 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1740 row -= 2; 1741 break; 1742 case 3: 1743 1744 sum1 = b[row]; 1745 sum2 = b[row-1]; 1746 sum3 = b[row-2]; 1747 v2 = a->a + diag[row-1] + 2; 1748 v3 = a->a + diag[row-2] + 3; 1749 for(n = 0; n<sz-1; n+=2) { 1750 i1 = idx[0]; 1751 i2 = idx[1]; 1752 idx += 2; 1753 tmp0 = x[i1]; 1754 tmp1 = x[i2]; 1755 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1756 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1757 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1758 } 1759 1760 if (n == sz-1){ 1761 tmp0 = x[*idx]; 1762 sum1 -= *v1*tmp0; 1763 sum2 -= *v2*tmp0; 1764 sum3 -= *v3*tmp0; 1765 } 1766 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1767 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1768 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1769 row -= 3; 1770 break; 1771 case 4: 1772 1773 sum1 = b[row]; 1774 sum2 = b[row-1]; 1775 sum3 = b[row-2]; 1776 sum4 = b[row-3]; 1777 v2 = a->a + diag[row-1] + 2; 1778 v3 = a->a + diag[row-2] + 3; 1779 v4 = a->a + diag[row-3] + 4; 1780 for(n = 0; n<sz-1; n+=2) { 1781 i1 = idx[0]; 1782 i2 = idx[1]; 1783 idx += 2; 1784 tmp0 = x[i1]; 1785 tmp1 = x[i2]; 1786 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1787 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1788 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1789 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1790 } 1791 1792 if (n == sz-1){ 1793 tmp0 = x[*idx]; 1794 sum1 -= *v1*tmp0; 1795 sum2 -= *v2*tmp0; 1796 sum3 -= *v3*tmp0; 1797 sum4 -= *v4*tmp0; 1798 } 1799 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 1800 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 1801 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 1802 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 1803 row -= 4; 1804 break; 1805 case 5: 1806 1807 sum1 = b[row]; 1808 sum2 = b[row-1]; 1809 sum3 = b[row-2]; 1810 sum4 = b[row-3]; 1811 sum5 = b[row-4]; 1812 v2 = a->a + diag[row-1] + 2; 1813 v3 = a->a + diag[row-2] + 3; 1814 v4 = a->a + diag[row-3] + 4; 1815 v5 = a->a + diag[row-4] + 5; 1816 for(n = 0; n<sz-1; n+=2) { 1817 i1 = idx[0]; 1818 i2 = idx[1]; 1819 idx += 2; 1820 tmp0 = x[i1]; 1821 tmp1 = x[i2]; 1822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1824 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1825 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1826 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1827 } 1828 1829 if (n == sz-1){ 1830 tmp0 = x[*idx]; 1831 sum1 -= *v1*tmp0; 1832 sum2 -= *v2*tmp0; 1833 sum3 -= *v3*tmp0; 1834 sum4 -= *v4*tmp0; 1835 sum5 -= *v5*tmp0; 1836 } 1837 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 1838 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 1839 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 1840 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 1841 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 1842 row -= 5; 1843 break; 1844 default: 1845 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1846 } 1847 CHKMEMQ; 1848 } 1849 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1850 1851 /* 1852 t = b - D x where D is the block diagonal 1853 */ 1854 cnt = 0; 1855 for (i=0, row=0; i<m; i++) { 1856 CHKMEMQ; 1857 switch (sizes[i]){ 1858 case 1: 1859 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 1860 break; 1861 case 2: 1862 x1 = x[row]; x2 = x[row+1]; 1863 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1864 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1865 t[row] = b[row] - tmp1; 1866 t[row+1] = b[row+1] - tmp2; row += 2; 1867 cnt += 4; 1868 break; 1869 case 3: 1870 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1871 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1872 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1873 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1874 t[row] = b[row] - tmp1; 1875 t[row+1] = b[row+1] - tmp2; 1876 t[row+2] = b[row+2] - tmp3; row += 3; 1877 cnt += 9; 1878 break; 1879 case 4: 1880 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1881 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1882 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1883 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1884 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1885 t[row] = b[row] - tmp1; 1886 t[row+1] = b[row+1] - tmp2; 1887 t[row+2] = b[row+2] - tmp3; 1888 t[row+3] = b[row+3] - tmp4; row += 4; 1889 cnt += 16; 1890 break; 1891 case 5: 1892 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1893 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1894 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1895 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1896 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1897 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1898 t[row] = b[row] - tmp1; 1899 t[row+1] = b[row+1] - tmp2; 1900 t[row+2] = b[row+2] - tmp3; 1901 t[row+3] = b[row+3] - tmp4; 1902 t[row+4] = b[row+4] - tmp5;row += 5; 1903 cnt += 25; 1904 break; 1905 default: 1906 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1907 } 1908 CHKMEMQ; 1909 } 1910 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1911 1912 1913 1914 /* 1915 Apply (L + D)^-1 where D is the block diagonal 1916 */ 1917 for (i=0, row=0; i<m; i++) { 1918 sz = diag[row] - ii[row]; 1919 v1 = a->a + ii[row]; 1920 idx = a->j + ii[row]; 1921 CHKMEMQ; 1922 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1923 switch (sizes[i]){ 1924 case 1: 1925 1926 sum1 = t[row]; 1927 for(n = 0; n<sz-1; n+=2) { 1928 i1 = idx[0]; 1929 i2 = idx[1]; 1930 idx += 2; 1931 tmp0 = t[i1]; 1932 tmp1 = t[i2]; 1933 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1934 } 1935 1936 if (n == sz-1){ 1937 tmp0 = t[*idx]; 1938 sum1 -= *v1 * tmp0; 1939 } 1940 x[row] += t[row] = sum1*(*ibdiag++); row++; 1941 break; 1942 case 2: 1943 v2 = a->a + ii[row+1]; 1944 sum1 = t[row]; 1945 sum2 = t[row+1]; 1946 for(n = 0; n<sz-1; n+=2) { 1947 i1 = idx[0]; 1948 i2 = idx[1]; 1949 idx += 2; 1950 tmp0 = t[i1]; 1951 tmp1 = t[i2]; 1952 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1953 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1954 } 1955 1956 if (n == sz-1){ 1957 tmp0 = t[*idx]; 1958 sum1 -= v1[0] * tmp0; 1959 sum2 -= v2[0] * tmp0; 1960 } 1961 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1962 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1963 ibdiag += 4; row += 2; 1964 break; 1965 case 3: 1966 v2 = a->a + ii[row+1]; 1967 v3 = a->a + ii[row+2]; 1968 sum1 = t[row]; 1969 sum2 = t[row+1]; 1970 sum3 = t[row+2]; 1971 for(n = 0; n<sz-1; n+=2) { 1972 i1 = idx[0]; 1973 i2 = idx[1]; 1974 idx += 2; 1975 tmp0 = t[i1]; 1976 tmp1 = t[i2]; 1977 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1978 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1979 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1980 } 1981 1982 if (n == sz-1){ 1983 tmp0 = t[*idx]; 1984 sum1 -= v1[0] * tmp0; 1985 sum2 -= v2[0] * tmp0; 1986 sum3 -= v3[0] * tmp0; 1987 } 1988 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1989 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1990 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1991 ibdiag += 9; row += 3; 1992 break; 1993 case 4: 1994 v2 = a->a + ii[row+1]; 1995 v3 = a->a + ii[row+2]; 1996 v4 = a->a + ii[row+3]; 1997 sum1 = t[row]; 1998 sum2 = t[row+1]; 1999 sum3 = t[row+2]; 2000 sum4 = t[row+3]; 2001 for(n = 0; n<sz-1; n+=2) { 2002 i1 = idx[0]; 2003 i2 = idx[1]; 2004 idx += 2; 2005 tmp0 = t[i1]; 2006 tmp1 = t[i2]; 2007 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2008 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2009 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2010 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2011 } 2012 2013 if (n == sz-1){ 2014 tmp0 = t[*idx]; 2015 sum1 -= v1[0] * tmp0; 2016 sum2 -= v2[0] * tmp0; 2017 sum3 -= v3[0] * tmp0; 2018 sum4 -= v4[0] * tmp0; 2019 } 2020 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2021 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2022 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2023 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2024 ibdiag += 16; row += 4; 2025 break; 2026 case 5: 2027 v2 = a->a + ii[row+1]; 2028 v3 = a->a + ii[row+2]; 2029 v4 = a->a + ii[row+3]; 2030 v5 = a->a + ii[row+4]; 2031 sum1 = t[row]; 2032 sum2 = t[row+1]; 2033 sum3 = t[row+2]; 2034 sum4 = t[row+3]; 2035 sum5 = t[row+4]; 2036 for(n = 0; n<sz-1; n+=2) { 2037 i1 = idx[0]; 2038 i2 = idx[1]; 2039 idx += 2; 2040 tmp0 = t[i1]; 2041 tmp1 = t[i2]; 2042 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2043 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2044 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2045 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2046 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2047 } 2048 2049 if (n == sz-1){ 2050 tmp0 = t[*idx]; 2051 sum1 -= v1[0] * tmp0; 2052 sum2 -= v2[0] * tmp0; 2053 sum3 -= v3[0] * tmp0; 2054 sum4 -= v4[0] * tmp0; 2055 sum5 -= v5[0] * tmp0; 2056 } 2057 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2058 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2059 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2060 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2061 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2062 ibdiag += 25; row += 5; 2063 break; 2064 default: 2065 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2066 } 2067 CHKMEMQ; 2068 } 2069 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2070 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2071 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2072 } 2073 PetscFunctionReturn(0); 2074 } 2075 2076 #undef __FUNCT__ 2077 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 2078 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2079 { 2080 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2081 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2082 const MatScalar *bdiag = a->inode.bdiag; 2083 const PetscScalar *b; 2084 PetscErrorCode ierr; 2085 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2086 const PetscInt *sizes = a->inode.size; 2087 2088 PetscFunctionBegin; 2089 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2090 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2091 cnt = 0; 2092 for (i=0, row=0; i<m; i++) { 2093 switch (sizes[i]){ 2094 case 1: 2095 x[row] = b[row]*bdiag[cnt++];row++; 2096 break; 2097 case 2: 2098 x1 = b[row]; x2 = b[row+1]; 2099 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2100 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2101 x[row++] = tmp1; 2102 x[row++] = tmp2; 2103 cnt += 4; 2104 break; 2105 case 3: 2106 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2107 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2108 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2109 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2110 x[row++] = tmp1; 2111 x[row++] = tmp2; 2112 x[row++] = tmp3; 2113 cnt += 9; 2114 break; 2115 case 4: 2116 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2117 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2118 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2119 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2120 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2121 x[row++] = tmp1; 2122 x[row++] = tmp2; 2123 x[row++] = tmp3; 2124 x[row++] = tmp4; 2125 cnt += 16; 2126 break; 2127 case 5: 2128 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2129 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2130 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2131 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2132 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2133 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2134 x[row++] = tmp1; 2135 x[row++] = tmp2; 2136 x[row++] = tmp3; 2137 x[row++] = tmp4; 2138 x[row++] = tmp5; 2139 cnt += 25; 2140 break; 2141 default: 2142 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2143 } 2144 } 2145 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2147 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2148 PetscFunctionReturn(0); 2149 } 2150 2151 /* 2152 samestructure indicates that the matrix has not changed its nonzero structure so we 2153 do not need to recompute the inodes 2154 */ 2155 #undef __FUNCT__ 2156 #define __FUNCT__ "Mat_CheckInode" 2157 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2158 { 2159 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2160 PetscErrorCode ierr; 2161 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2162 PetscTruth flag; 2163 2164 PetscFunctionBegin; 2165 if (!a->inode.use) PetscFunctionReturn(0); 2166 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2167 2168 2169 m = A->rmap->n; 2170 if (a->inode.size) {ns = a->inode.size;} 2171 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2172 2173 i = 0; 2174 node_count = 0; 2175 idx = a->j; 2176 ii = a->i; 2177 while (i < m){ /* For each row */ 2178 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2179 /* Limits the number of elements in a node to 'a->inode.limit' */ 2180 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2181 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2182 if (nzy != nzx) break; 2183 idy += nzx; /* Same nonzero pattern */ 2184 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2185 if (!flag) break; 2186 } 2187 ns[node_count++] = blk_size; 2188 idx += blk_size*nzx; 2189 i = j; 2190 } 2191 /* If not enough inodes found,, do not use inode version of the routines */ 2192 if (!a->inode.size && m && node_count > .9*m) { 2193 ierr = PetscFree(ns);CHKERRQ(ierr); 2194 a->inode.node_count = 0; 2195 a->inode.size = PETSC_NULL; 2196 a->inode.use = PETSC_FALSE; 2197 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2198 } else { 2199 A->ops->mult = MatMult_SeqAIJ_Inode; 2200 A->ops->sor = MatSOR_SeqAIJ_Inode; 2201 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 2202 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 2203 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 2204 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 2205 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 2206 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 2207 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 2208 a->inode.node_count = node_count; 2209 a->inode.size = ns; 2210 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2211 } 2212 PetscFunctionReturn(0); 2213 } 2214 2215 /* 2216 This is really ugly. if inodes are used this replaces the 2217 permutations with ones that correspond to rows/cols of the matrix 2218 rather then inode blocks 2219 */ 2220 #undef __FUNCT__ 2221 #define __FUNCT__ "MatInodeAdjustForInodes" 2222 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2223 { 2224 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2225 2226 PetscFunctionBegin; 2227 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2228 if (f) { 2229 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2230 } 2231 PetscFunctionReturn(0); 2232 } 2233 2234 EXTERN_C_BEGIN 2235 #undef __FUNCT__ 2236 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 2237 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 2238 { 2239 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2240 PetscErrorCode ierr; 2241 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2242 const PetscInt *ridx,*cidx; 2243 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2244 PetscInt nslim_col,*ns_col; 2245 IS ris = *rperm,cis = *cperm; 2246 2247 PetscFunctionBegin; 2248 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2249 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2250 2251 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2252 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2253 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2254 permc = permr + m; 2255 2256 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2257 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2258 2259 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2260 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2261 2262 /* Construct the permutations for rows*/ 2263 for (i=0,row = 0; i<nslim_row; ++i){ 2264 indx = ridx[i]; 2265 start_val = tns[indx]; 2266 end_val = tns[indx + 1]; 2267 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2268 } 2269 2270 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2271 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2272 2273 /* Construct permutations for columns */ 2274 for (i=0,col=0; i<nslim_col; ++i){ 2275 indx = cidx[i]; 2276 start_val = tns[indx]; 2277 end_val = tns[indx + 1]; 2278 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2279 } 2280 2281 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2282 ISSetPermutation(*rperm); 2283 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2284 ISSetPermutation(*cperm); 2285 2286 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2287 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2288 2289 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2290 ierr = PetscFree(permr);CHKERRQ(ierr); 2291 ierr = ISDestroy(cis);CHKERRQ(ierr); 2292 ierr = ISDestroy(ris);CHKERRQ(ierr); 2293 ierr = PetscFree(tns);CHKERRQ(ierr); 2294 PetscFunctionReturn(0); 2295 } 2296 EXTERN_C_END 2297 2298 #undef __FUNCT__ 2299 #define __FUNCT__ "MatInodeGetInodeSizes" 2300 /*@C 2301 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2302 2303 Collective on Mat 2304 2305 Input Parameter: 2306 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2307 2308 Output Parameter: 2309 + node_count - no of inodes present in the matrix. 2310 . sizes - an array of size node_count,with sizes of each inode. 2311 - limit - the max size used to generate the inodes. 2312 2313 Level: advanced 2314 2315 Notes: This routine returns some internal storage information 2316 of the matrix, it is intended to be used by advanced users. 2317 It should be called after the matrix is assembled. 2318 The contents of the sizes[] array should not be changed. 2319 PETSC_NULL may be passed for information not requested. 2320 2321 .keywords: matrix, seqaij, get, inode 2322 2323 .seealso: MatGetInfo() 2324 @*/ 2325 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2326 { 2327 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2328 2329 PetscFunctionBegin; 2330 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2331 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2332 if (f) { 2333 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2334 } 2335 PetscFunctionReturn(0); 2336 } 2337 2338 EXTERN_C_BEGIN 2339 #undef __FUNCT__ 2340 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 2341 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2342 { 2343 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2344 2345 PetscFunctionBegin; 2346 if (node_count) *node_count = a->inode.node_count; 2347 if (sizes) *sizes = a->inode.size; 2348 if (limit) *limit = a->inode.limit; 2349 PetscFunctionReturn(0); 2350 } 2351 EXTERN_C_END 2352