1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 /* ----------------------------------------------------------- */ 1182 #undef __FUNCT__ 1183 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 1184 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 1185 { 1186 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1187 IS iscol = a->col,isrow = a->row; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*c,*rout,*cout; 1190 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 1191 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 1192 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 1193 PetscScalar sum1,sum2,sum3,sum4,sum5; 1194 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 1195 const PetscScalar *b; 1196 1197 PetscFunctionBegin; 1198 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 1199 node_max = a->inode.node_count; 1200 ns = a->inode.size; /* Node Size array */ 1201 1202 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1203 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1204 tmp = a->solve_work; 1205 1206 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1207 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1208 1209 /* forward solve the lower triangular */ 1210 tmps = tmp ; 1211 aa = a_a ; 1212 aj = a_j ; 1213 ad = a->diag; 1214 1215 for (i = 0,row = 0; i< node_max; ++i){ 1216 nsz = ns[i]; 1217 aii = ai[row]; 1218 v1 = aa + aii; 1219 vi = aj + aii; 1220 nz = ai[row+1]- ai[row]; 1221 1222 if (i < node_max-1) { 1223 /* Prefetch the indices for the next block */ 1224 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 1225 /* Prefetch the data for the next block */ 1226 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 1227 } 1228 1229 switch (nsz){ /* Each loop in 'case' is unrolled */ 1230 case 1 : 1231 sum1 = b[r[row]]; 1232 for(j=0; j<nz-1; j+=2){ 1233 i0 = vi[j]; 1234 i1 = vi[j+1]; 1235 tmp0 = tmps[i0]; 1236 tmp1 = tmps[i1]; 1237 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 1238 } 1239 if(j == nz-1){ 1240 tmp0 = tmps[vi[j]]; 1241 sum1 -= v1[j]*tmp0; 1242 } 1243 tmp[row++]=sum1; 1244 break; 1245 case 2: 1246 sum1 = b[r[row]]; 1247 sum2 = b[r[row+1]]; 1248 v2 = aa + ai[row+1]; 1249 1250 for(j=0; j<nz-1; j+=2){ 1251 i0 = vi[j]; 1252 i1 = vi[j+1]; 1253 tmp0 = tmps[i0]; 1254 tmp1 = tmps[i1]; 1255 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1256 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1257 } 1258 if(j == nz-1){ 1259 tmp0 = tmps[vi[j]]; 1260 sum1 -= v1[j] *tmp0; 1261 sum2 -= v2[j] *tmp0; 1262 } 1263 sum2 -= v2[nz] * sum1; 1264 tmp[row ++]=sum1; 1265 tmp[row ++]=sum2; 1266 break; 1267 case 3: 1268 sum1 = b[r[row]]; 1269 sum2 = b[r[row+1]]; 1270 sum3 = b[r[row+2]]; 1271 v2 = aa + ai[row+1]; 1272 v3 = aa + ai[row+2]; 1273 1274 for (j=0; j<nz-1; j+=2){ 1275 i0 = vi[j]; 1276 i1 = vi[j+1]; 1277 tmp0 = tmps[i0]; 1278 tmp1 = tmps[i1]; 1279 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1280 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1281 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1282 } 1283 if (j == nz-1){ 1284 tmp0 = tmps[vi[j]]; 1285 sum1 -= v1[j] *tmp0; 1286 sum2 -= v2[j] *tmp0; 1287 sum3 -= v3[j] *tmp0; 1288 } 1289 sum2 -= v2[nz] * sum1; 1290 sum3 -= v3[nz] * sum1; 1291 sum3 -= v3[nz+1] * sum2; 1292 tmp[row ++]=sum1; 1293 tmp[row ++]=sum2; 1294 tmp[row ++]=sum3; 1295 break; 1296 1297 case 4: 1298 sum1 = b[r[row]]; 1299 sum2 = b[r[row+1]]; 1300 sum3 = b[r[row+2]]; 1301 sum4 = b[r[row+3]]; 1302 v2 = aa + ai[row+1]; 1303 v3 = aa + ai[row+2]; 1304 v4 = aa + ai[row+3]; 1305 1306 for (j=0; j<nz-1; j+=2){ 1307 i0 = vi[j]; 1308 i1 = vi[j+1]; 1309 tmp0 = tmps[i0]; 1310 tmp1 = tmps[i1]; 1311 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1312 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1313 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1314 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1315 } 1316 if (j == nz-1){ 1317 tmp0 = tmps[vi[j]]; 1318 sum1 -= v1[j] *tmp0; 1319 sum2 -= v2[j] *tmp0; 1320 sum3 -= v3[j] *tmp0; 1321 sum4 -= v4[j] *tmp0; 1322 } 1323 sum2 -= v2[nz] * sum1; 1324 sum3 -= v3[nz] * sum1; 1325 sum4 -= v4[nz] * sum1; 1326 sum3 -= v3[nz+1] * sum2; 1327 sum4 -= v4[nz+1] * sum2; 1328 sum4 -= v4[nz+2] * sum3; 1329 1330 tmp[row ++]=sum1; 1331 tmp[row ++]=sum2; 1332 tmp[row ++]=sum3; 1333 tmp[row ++]=sum4; 1334 break; 1335 case 5: 1336 sum1 = b[r[row]]; 1337 sum2 = b[r[row+1]]; 1338 sum3 = b[r[row+2]]; 1339 sum4 = b[r[row+3]]; 1340 sum5 = b[r[row+4]]; 1341 v2 = aa + ai[row+1]; 1342 v3 = aa + ai[row+2]; 1343 v4 = aa + ai[row+3]; 1344 v5 = aa + ai[row+4]; 1345 1346 for (j=0; j<nz-1; j+=2){ 1347 i0 = vi[j]; 1348 i1 = vi[j+1]; 1349 tmp0 = tmps[i0]; 1350 tmp1 = tmps[i1]; 1351 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1352 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1353 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1354 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1355 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 1356 } 1357 if (j == nz-1){ 1358 tmp0 = tmps[vi[j]]; 1359 sum1 -= v1[j] *tmp0; 1360 sum2 -= v2[j] *tmp0; 1361 sum3 -= v3[j] *tmp0; 1362 sum4 -= v4[j] *tmp0; 1363 sum5 -= v5[j] *tmp0; 1364 } 1365 1366 sum2 -= v2[nz] * sum1; 1367 sum3 -= v3[nz] * sum1; 1368 sum4 -= v4[nz] * sum1; 1369 sum5 -= v5[nz] * sum1; 1370 sum3 -= v3[nz+1] * sum2; 1371 sum4 -= v4[nz+1] * sum2; 1372 sum5 -= v5[nz+1] * sum2; 1373 sum4 -= v4[nz+2] * sum3; 1374 sum5 -= v5[nz+2] * sum3; 1375 sum5 -= v5[nz+3] * sum4; 1376 1377 tmp[row ++]=sum1; 1378 tmp[row ++]=sum2; 1379 tmp[row ++]=sum3; 1380 tmp[row ++]=sum4; 1381 tmp[row ++]=sum5; 1382 break; 1383 default: 1384 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1385 } 1386 } 1387 /* backward solve the upper triangular */ 1388 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 1389 nsz = ns[i]; 1390 aii = ad[row+1] + 1; 1391 v1 = aa + aii; 1392 vi = aj + aii; 1393 nz = ad[row]- ad[row+1] - 1; 1394 1395 if (i > 0) { 1396 /* Prefetch the indices for the next block */ 1397 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 1398 /* Prefetch the data for the next block */ 1399 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 1400 } 1401 1402 switch (nsz){ /* Each loop in 'case' is unrolled */ 1403 case 1 : 1404 sum1 = tmp[row]; 1405 1406 for(j=0 ; j<nz-1; j+=2){ 1407 i0 = vi[j]; 1408 i1 = vi[j+1]; 1409 tmp0 = tmps[i0]; 1410 tmp1 = tmps[i1]; 1411 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1412 } 1413 if (j == nz-1){ 1414 tmp0 = tmps[vi[j]]; 1415 sum1 -= v1[j]*tmp0; 1416 } 1417 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1418 break; 1419 case 2 : 1420 sum1 = tmp[row]; 1421 sum2 = tmp[row-1]; 1422 v2 = aa + ad[row] + 1; 1423 for (j=0 ; j<nz-1; j+=2){ 1424 i0 = vi[j]; 1425 i1 = vi[j+1]; 1426 tmp0 = tmps[i0]; 1427 tmp1 = tmps[i1]; 1428 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1429 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1430 } 1431 if (j == nz-1){ 1432 tmp0 = tmps[vi[j]]; 1433 sum1 -= v1[j]* tmp0; 1434 sum2 -= v2[j+1]* tmp0; 1435 } 1436 1437 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1438 sum2 -= v2[0] * tmp0; 1439 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1440 break; 1441 case 3 : 1442 sum1 = tmp[row]; 1443 sum2 = tmp[row -1]; 1444 sum3 = tmp[row -2]; 1445 v2 = aa + ad[row] + 1; 1446 v3 = aa + ad[row -1] + 1; 1447 for (j=0 ; j<nz-1; j+=2){ 1448 i0 = vi[j]; 1449 i1 = vi[j+1]; 1450 tmp0 = tmps[i0]; 1451 tmp1 = tmps[i1]; 1452 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1453 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1454 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1455 } 1456 if (j== nz-1){ 1457 tmp0 = tmps[vi[j]]; 1458 sum1 -= v1[j] * tmp0; 1459 sum2 -= v2[j+1] * tmp0; 1460 sum3 -= v3[j+2] * tmp0; 1461 } 1462 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1463 sum2 -= v2[0]* tmp0; 1464 sum3 -= v3[1] * tmp0; 1465 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1466 sum3 -= v3[0]* tmp0; 1467 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1468 1469 break; 1470 case 4 : 1471 sum1 = tmp[row]; 1472 sum2 = tmp[row -1]; 1473 sum3 = tmp[row -2]; 1474 sum4 = tmp[row -3]; 1475 v2 = aa + ad[row]+1; 1476 v3 = aa + ad[row -1]+1; 1477 v4 = aa + ad[row -2]+1; 1478 1479 for (j=0 ; j<nz-1; j+=2){ 1480 i0 = vi[j]; 1481 i1 = vi[j+1]; 1482 tmp0 = tmps[i0]; 1483 tmp1 = tmps[i1]; 1484 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1485 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1486 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1487 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1488 } 1489 if (j== nz-1){ 1490 tmp0 = tmps[vi[j]]; 1491 sum1 -= v1[j] * tmp0; 1492 sum2 -= v2[j+1] * tmp0; 1493 sum3 -= v3[j+2] * tmp0; 1494 sum4 -= v4[j+3] * tmp0; 1495 } 1496 1497 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1498 sum2 -= v2[0] * tmp0; 1499 sum3 -= v3[1] * tmp0; 1500 sum4 -= v4[2] * tmp0; 1501 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1502 sum3 -= v3[0] * tmp0; 1503 sum4 -= v4[1] * tmp0; 1504 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1505 sum4 -= v4[0] * tmp0; 1506 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1507 break; 1508 case 5 : 1509 sum1 = tmp[row]; 1510 sum2 = tmp[row -1]; 1511 sum3 = tmp[row -2]; 1512 sum4 = tmp[row -3]; 1513 sum5 = tmp[row -4]; 1514 v2 = aa + ad[row]+1; 1515 v3 = aa + ad[row -1]+1; 1516 v4 = aa + ad[row -2]+1; 1517 v5 = aa + ad[row -3]+1; 1518 for (j=0 ; j<nz-1; j+=2){ 1519 i0 = vi[j]; 1520 i1 = vi[j+1]; 1521 tmp0 = tmps[i0]; 1522 tmp1 = tmps[i1]; 1523 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1524 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1525 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1526 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1527 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 1528 } 1529 if (j==nz-1){ 1530 tmp0 = tmps[vi[j]]; 1531 sum1 -= v1[j] * tmp0; 1532 sum2 -= v2[j+1] * tmp0; 1533 sum3 -= v3[j+2] * tmp0; 1534 sum4 -= v4[j+3] * tmp0; 1535 sum5 -= v5[j+4] * tmp0; 1536 } 1537 1538 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1539 sum2 -= v2[0] * tmp0; 1540 sum3 -= v3[1] * tmp0; 1541 sum4 -= v4[2] * tmp0; 1542 sum5 -= v5[3] * tmp0; 1543 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1544 sum3 -= v3[0] * tmp0; 1545 sum4 -= v4[1] * tmp0; 1546 sum5 -= v5[2] * tmp0; 1547 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1548 sum4 -= v4[0] * tmp0; 1549 sum5 -= v5[1] * tmp0; 1550 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1551 sum5 -= v5[0] * tmp0; 1552 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 1553 break; 1554 default: 1555 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1556 } 1557 } 1558 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1559 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1560 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1561 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1562 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1563 PetscFunctionReturn(0); 1564 } 1565 1566 1567 /* 1568 Makes a longer coloring[] array and calls the usual code with that 1569 */ 1570 #undef __FUNCT__ 1571 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 1572 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1573 { 1574 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1575 PetscErrorCode ierr; 1576 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1577 PetscInt *colorused,i; 1578 ISColoringValue *newcolor; 1579 1580 PetscFunctionBegin; 1581 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1582 /* loop over inodes, marking a color for each column*/ 1583 row = 0; 1584 for (i=0; i<m; i++){ 1585 for (j=0; j<ns[i]; j++) { 1586 newcolor[row++] = coloring[i] + j*ncolors; 1587 } 1588 } 1589 1590 /* eliminate unneeded colors */ 1591 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1592 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1593 for (i=0; i<n; i++) { 1594 colorused[newcolor[i]] = 1; 1595 } 1596 1597 for (i=1; i<5*ncolors; i++) { 1598 colorused[i] += colorused[i-1]; 1599 } 1600 ncolors = colorused[5*ncolors-1]; 1601 for (i=0; i<n; i++) { 1602 newcolor[i] = colorused[newcolor[i]]-1; 1603 } 1604 ierr = PetscFree(colorused);CHKERRQ(ierr); 1605 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1606 ierr = PetscFree(coloring);CHKERRQ(ierr); 1607 PetscFunctionReturn(0); 1608 } 1609 1610 #include "../src/mat/blockinvert.h" 1611 1612 #undef __FUNCT__ 1613 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 1614 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1615 { 1616 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1617 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1618 MatScalar *ibdiag,*bdiag,work[25]; 1619 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1620 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1621 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1622 PetscErrorCode ierr; 1623 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1624 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 1625 1626 PetscFunctionBegin; 1627 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1628 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1629 if (its > 1) { 1630 /* switch to non-inode version */ 1631 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1632 PetscFunctionReturn(0); 1633 } 1634 1635 if (!a->inode.ibdiagvalid) { 1636 if (!a->inode.ibdiag) { 1637 /* calculate space needed for diagonal blocks */ 1638 for (i=0; i<m; i++) { 1639 cnt += sizes[i]*sizes[i]; 1640 } 1641 a->inode.bdiagsize = cnt; 1642 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1643 } 1644 1645 /* copy over the diagonal blocks and invert them */ 1646 ibdiag = a->inode.ibdiag; 1647 bdiag = a->inode.bdiag; 1648 cnt = 0; 1649 for (i=0, row = 0; i<m; i++) { 1650 for (j=0; j<sizes[i]; j++) { 1651 for (k=0; k<sizes[i]; k++) { 1652 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1653 } 1654 } 1655 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1656 1657 switch(sizes[i]) { 1658 case 1: 1659 /* Create matrix data structure */ 1660 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1661 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1662 break; 1663 case 2: 1664 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1665 break; 1666 case 3: 1667 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1668 break; 1669 case 4: 1670 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1671 break; 1672 case 5: 1673 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 1674 break; 1675 default: 1676 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1677 } 1678 cnt += sizes[i]*sizes[i]; 1679 row += sizes[i]; 1680 } 1681 a->inode.ibdiagvalid = PETSC_TRUE; 1682 } 1683 ibdiag = a->inode.ibdiag; 1684 bdiag = a->inode.bdiag; 1685 1686 1687 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1688 if (flag & SOR_ZERO_INITIAL_GUESS) { 1689 PetscScalar *b; 1690 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1691 if (xx != bb) { 1692 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1693 } else { 1694 b = x; 1695 } 1696 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1697 1698 for (i=0, row=0; i<m; i++) { 1699 sz = diag[row] - ii[row]; 1700 v1 = a->a + ii[row]; 1701 idx = a->j + ii[row]; 1702 1703 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1704 switch (sizes[i]){ 1705 case 1: 1706 1707 sum1 = b[row]; 1708 for(n = 0; n<sz-1; n+=2) { 1709 i1 = idx[0]; 1710 i2 = idx[1]; 1711 idx += 2; 1712 tmp0 = x[i1]; 1713 tmp1 = x[i2]; 1714 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1715 } 1716 1717 if (n == sz-1){ 1718 tmp0 = x[*idx]; 1719 sum1 -= *v1 * tmp0; 1720 } 1721 x[row++] = sum1*(*ibdiag++); 1722 break; 1723 case 2: 1724 v2 = a->a + ii[row+1]; 1725 sum1 = b[row]; 1726 sum2 = b[row+1]; 1727 for(n = 0; n<sz-1; n+=2) { 1728 i1 = idx[0]; 1729 i2 = idx[1]; 1730 idx += 2; 1731 tmp0 = x[i1]; 1732 tmp1 = x[i2]; 1733 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1734 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1735 } 1736 1737 if (n == sz-1){ 1738 tmp0 = x[*idx]; 1739 sum1 -= v1[0] * tmp0; 1740 sum2 -= v2[0] * tmp0; 1741 } 1742 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1743 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1744 ibdiag += 4; 1745 break; 1746 case 3: 1747 v2 = a->a + ii[row+1]; 1748 v3 = a->a + ii[row+2]; 1749 sum1 = b[row]; 1750 sum2 = b[row+1]; 1751 sum3 = b[row+2]; 1752 for(n = 0; n<sz-1; n+=2) { 1753 i1 = idx[0]; 1754 i2 = idx[1]; 1755 idx += 2; 1756 tmp0 = x[i1]; 1757 tmp1 = x[i2]; 1758 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1759 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1760 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1761 } 1762 1763 if (n == sz-1){ 1764 tmp0 = x[*idx]; 1765 sum1 -= v1[0] * tmp0; 1766 sum2 -= v2[0] * tmp0; 1767 sum3 -= v3[0] * tmp0; 1768 } 1769 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1770 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1771 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1772 ibdiag += 9; 1773 break; 1774 case 4: 1775 v2 = a->a + ii[row+1]; 1776 v3 = a->a + ii[row+2]; 1777 v4 = a->a + ii[row+3]; 1778 sum1 = b[row]; 1779 sum2 = b[row+1]; 1780 sum3 = b[row+2]; 1781 sum4 = b[row+3]; 1782 for(n = 0; n<sz-1; n+=2) { 1783 i1 = idx[0]; 1784 i2 = idx[1]; 1785 idx += 2; 1786 tmp0 = x[i1]; 1787 tmp1 = x[i2]; 1788 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1789 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1790 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1791 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1792 } 1793 1794 if (n == sz-1){ 1795 tmp0 = x[*idx]; 1796 sum1 -= v1[0] * tmp0; 1797 sum2 -= v2[0] * tmp0; 1798 sum3 -= v3[0] * tmp0; 1799 sum4 -= v4[0] * tmp0; 1800 } 1801 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1802 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1803 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1804 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1805 ibdiag += 16; 1806 break; 1807 case 5: 1808 v2 = a->a + ii[row+1]; 1809 v3 = a->a + ii[row+2]; 1810 v4 = a->a + ii[row+3]; 1811 v5 = a->a + ii[row+4]; 1812 sum1 = b[row]; 1813 sum2 = b[row+1]; 1814 sum3 = b[row+2]; 1815 sum4 = b[row+3]; 1816 sum5 = b[row+4]; 1817 for(n = 0; n<sz-1; n+=2) { 1818 i1 = idx[0]; 1819 i2 = idx[1]; 1820 idx += 2; 1821 tmp0 = x[i1]; 1822 tmp1 = x[i2]; 1823 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1824 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1825 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1826 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1827 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1828 } 1829 1830 if (n == sz-1){ 1831 tmp0 = x[*idx]; 1832 sum1 -= v1[0] * tmp0; 1833 sum2 -= v2[0] * tmp0; 1834 sum3 -= v3[0] * tmp0; 1835 sum4 -= v4[0] * tmp0; 1836 sum5 -= v5[0] * tmp0; 1837 } 1838 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1839 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1840 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1841 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1842 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1843 ibdiag += 25; 1844 break; 1845 default: 1846 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1847 } 1848 } 1849 1850 xb = x; 1851 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1852 } else xb = b; 1853 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1854 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1855 cnt = 0; 1856 for (i=0, row=0; i<m; i++) { 1857 1858 switch (sizes[i]){ 1859 case 1: 1860 x[row++] *= bdiag[cnt++]; 1861 break; 1862 case 2: 1863 x1 = x[row]; x2 = x[row+1]; 1864 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1865 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1866 x[row++] = tmp1; 1867 x[row++] = tmp2; 1868 cnt += 4; 1869 break; 1870 case 3: 1871 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1872 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1873 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1874 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1875 x[row++] = tmp1; 1876 x[row++] = tmp2; 1877 x[row++] = tmp3; 1878 cnt += 9; 1879 break; 1880 case 4: 1881 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1882 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1883 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1884 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1885 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1886 x[row++] = tmp1; 1887 x[row++] = tmp2; 1888 x[row++] = tmp3; 1889 x[row++] = tmp4; 1890 cnt += 16; 1891 break; 1892 case 5: 1893 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1894 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1895 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1896 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1897 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1898 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1899 x[row++] = tmp1; 1900 x[row++] = tmp2; 1901 x[row++] = tmp3; 1902 x[row++] = tmp4; 1903 x[row++] = tmp5; 1904 cnt += 25; 1905 break; 1906 default: 1907 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1908 } 1909 } 1910 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1911 } 1912 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1913 1914 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1915 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1916 ibdiag -= sizes[i]*sizes[i]; 1917 sz = ii[row+1] - diag[row] - 1; 1918 v1 = a->a + diag[row] + 1; 1919 idx = a->j + diag[row] + 1; 1920 1921 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1922 switch (sizes[i]){ 1923 case 1: 1924 1925 sum1 = xb[row]; 1926 for(n = 0; n<sz-1; n+=2) { 1927 i1 = idx[0]; 1928 i2 = idx[1]; 1929 idx += 2; 1930 tmp0 = x[i1]; 1931 tmp1 = x[i2]; 1932 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1933 } 1934 1935 if (n == sz-1){ 1936 tmp0 = x[*idx]; 1937 sum1 -= *v1*tmp0; 1938 } 1939 x[row--] = sum1*(*ibdiag); 1940 break; 1941 1942 case 2: 1943 1944 sum1 = xb[row]; 1945 sum2 = xb[row-1]; 1946 /* note that sum1 is associated with the second of the two rows */ 1947 v2 = a->a + diag[row-1] + 2; 1948 for(n = 0; n<sz-1; n+=2) { 1949 i1 = idx[0]; 1950 i2 = idx[1]; 1951 idx += 2; 1952 tmp0 = x[i1]; 1953 tmp1 = x[i2]; 1954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1956 } 1957 1958 if (n == sz-1){ 1959 tmp0 = x[*idx]; 1960 sum1 -= *v1*tmp0; 1961 sum2 -= *v2*tmp0; 1962 } 1963 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1964 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1965 break; 1966 case 3: 1967 1968 sum1 = xb[row]; 1969 sum2 = xb[row-1]; 1970 sum3 = xb[row-2]; 1971 v2 = a->a + diag[row-1] + 2; 1972 v3 = a->a + diag[row-2] + 3; 1973 for(n = 0; n<sz-1; n+=2) { 1974 i1 = idx[0]; 1975 i2 = idx[1]; 1976 idx += 2; 1977 tmp0 = x[i1]; 1978 tmp1 = x[i2]; 1979 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1980 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1981 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1982 } 1983 1984 if (n == sz-1){ 1985 tmp0 = x[*idx]; 1986 sum1 -= *v1*tmp0; 1987 sum2 -= *v2*tmp0; 1988 sum3 -= *v3*tmp0; 1989 } 1990 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1991 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1992 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1993 break; 1994 case 4: 1995 1996 sum1 = xb[row]; 1997 sum2 = xb[row-1]; 1998 sum3 = xb[row-2]; 1999 sum4 = xb[row-3]; 2000 v2 = a->a + diag[row-1] + 2; 2001 v3 = a->a + diag[row-2] + 3; 2002 v4 = a->a + diag[row-3] + 4; 2003 for(n = 0; n<sz-1; n+=2) { 2004 i1 = idx[0]; 2005 i2 = idx[1]; 2006 idx += 2; 2007 tmp0 = x[i1]; 2008 tmp1 = x[i2]; 2009 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2010 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2011 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2012 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2013 } 2014 2015 if (n == sz-1){ 2016 tmp0 = x[*idx]; 2017 sum1 -= *v1*tmp0; 2018 sum2 -= *v2*tmp0; 2019 sum3 -= *v3*tmp0; 2020 sum4 -= *v4*tmp0; 2021 } 2022 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2023 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2024 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2025 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2026 break; 2027 case 5: 2028 2029 sum1 = xb[row]; 2030 sum2 = xb[row-1]; 2031 sum3 = xb[row-2]; 2032 sum4 = xb[row-3]; 2033 sum5 = xb[row-4]; 2034 v2 = a->a + diag[row-1] + 2; 2035 v3 = a->a + diag[row-2] + 3; 2036 v4 = a->a + diag[row-3] + 4; 2037 v5 = a->a + diag[row-4] + 5; 2038 for(n = 0; n<sz-1; n+=2) { 2039 i1 = idx[0]; 2040 i2 = idx[1]; 2041 idx += 2; 2042 tmp0 = x[i1]; 2043 tmp1 = x[i2]; 2044 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2045 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2046 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2047 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2048 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2049 } 2050 2051 if (n == sz-1){ 2052 tmp0 = x[*idx]; 2053 sum1 -= *v1*tmp0; 2054 sum2 -= *v2*tmp0; 2055 sum3 -= *v3*tmp0; 2056 sum4 -= *v4*tmp0; 2057 sum5 -= *v5*tmp0; 2058 } 2059 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2060 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2061 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2062 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2063 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2064 break; 2065 default: 2066 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2067 } 2068 } 2069 2070 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2071 } 2072 its--; 2073 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2074 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2075 } 2076 if (flag & SOR_EISENSTAT) { 2077 const PetscScalar *b; 2078 MatScalar *t = a->inode.ssor_work; 2079 2080 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2081 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2082 /* 2083 Apply (U + D)^-1 where D is now the block diagonal 2084 */ 2085 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2086 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2087 ibdiag -= sizes[i]*sizes[i]; 2088 sz = ii[row+1] - diag[row] - 1; 2089 v1 = a->a + diag[row] + 1; 2090 idx = a->j + diag[row] + 1; 2091 CHKMEMQ; 2092 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2093 switch (sizes[i]){ 2094 case 1: 2095 2096 sum1 = b[row]; 2097 for(n = 0; n<sz-1; n+=2) { 2098 i1 = idx[0]; 2099 i2 = idx[1]; 2100 idx += 2; 2101 tmp0 = x[i1]; 2102 tmp1 = x[i2]; 2103 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2104 } 2105 2106 if (n == sz-1){ 2107 tmp0 = x[*idx]; 2108 sum1 -= *v1*tmp0; 2109 } 2110 x[row] = sum1*(*ibdiag);row--; 2111 break; 2112 2113 case 2: 2114 2115 sum1 = b[row]; 2116 sum2 = b[row-1]; 2117 /* note that sum1 is associated with the second of the two rows */ 2118 v2 = a->a + diag[row-1] + 2; 2119 for(n = 0; n<sz-1; n+=2) { 2120 i1 = idx[0]; 2121 i2 = idx[1]; 2122 idx += 2; 2123 tmp0 = x[i1]; 2124 tmp1 = x[i2]; 2125 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2126 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2127 } 2128 2129 if (n == sz-1){ 2130 tmp0 = x[*idx]; 2131 sum1 -= *v1*tmp0; 2132 sum2 -= *v2*tmp0; 2133 } 2134 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2135 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2136 row -= 2; 2137 break; 2138 case 3: 2139 2140 sum1 = b[row]; 2141 sum2 = b[row-1]; 2142 sum3 = b[row-2]; 2143 v2 = a->a + diag[row-1] + 2; 2144 v3 = a->a + diag[row-2] + 3; 2145 for(n = 0; n<sz-1; n+=2) { 2146 i1 = idx[0]; 2147 i2 = idx[1]; 2148 idx += 2; 2149 tmp0 = x[i1]; 2150 tmp1 = x[i2]; 2151 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2152 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2153 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2154 } 2155 2156 if (n == sz-1){ 2157 tmp0 = x[*idx]; 2158 sum1 -= *v1*tmp0; 2159 sum2 -= *v2*tmp0; 2160 sum3 -= *v3*tmp0; 2161 } 2162 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2163 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2164 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2165 row -= 3; 2166 break; 2167 case 4: 2168 2169 sum1 = b[row]; 2170 sum2 = b[row-1]; 2171 sum3 = b[row-2]; 2172 sum4 = b[row-3]; 2173 v2 = a->a + diag[row-1] + 2; 2174 v3 = a->a + diag[row-2] + 3; 2175 v4 = a->a + diag[row-3] + 4; 2176 for(n = 0; n<sz-1; n+=2) { 2177 i1 = idx[0]; 2178 i2 = idx[1]; 2179 idx += 2; 2180 tmp0 = x[i1]; 2181 tmp1 = x[i2]; 2182 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2183 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2184 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2185 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2186 } 2187 2188 if (n == sz-1){ 2189 tmp0 = x[*idx]; 2190 sum1 -= *v1*tmp0; 2191 sum2 -= *v2*tmp0; 2192 sum3 -= *v3*tmp0; 2193 sum4 -= *v4*tmp0; 2194 } 2195 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2196 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2197 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2198 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2199 row -= 4; 2200 break; 2201 case 5: 2202 2203 sum1 = b[row]; 2204 sum2 = b[row-1]; 2205 sum3 = b[row-2]; 2206 sum4 = b[row-3]; 2207 sum5 = b[row-4]; 2208 v2 = a->a + diag[row-1] + 2; 2209 v3 = a->a + diag[row-2] + 3; 2210 v4 = a->a + diag[row-3] + 4; 2211 v5 = a->a + diag[row-4] + 5; 2212 for(n = 0; n<sz-1; n+=2) { 2213 i1 = idx[0]; 2214 i2 = idx[1]; 2215 idx += 2; 2216 tmp0 = x[i1]; 2217 tmp1 = x[i2]; 2218 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2219 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2220 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2221 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2222 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2223 } 2224 2225 if (n == sz-1){ 2226 tmp0 = x[*idx]; 2227 sum1 -= *v1*tmp0; 2228 sum2 -= *v2*tmp0; 2229 sum3 -= *v3*tmp0; 2230 sum4 -= *v4*tmp0; 2231 sum5 -= *v5*tmp0; 2232 } 2233 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2234 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2235 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2236 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2237 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2238 row -= 5; 2239 break; 2240 default: 2241 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2242 } 2243 CHKMEMQ; 2244 } 2245 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2246 2247 /* 2248 t = b - D x where D is the block diagonal 2249 */ 2250 cnt = 0; 2251 for (i=0, row=0; i<m; i++) { 2252 CHKMEMQ; 2253 switch (sizes[i]){ 2254 case 1: 2255 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2256 break; 2257 case 2: 2258 x1 = x[row]; x2 = x[row+1]; 2259 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2260 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2261 t[row] = b[row] - tmp1; 2262 t[row+1] = b[row+1] - tmp2; row += 2; 2263 cnt += 4; 2264 break; 2265 case 3: 2266 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2267 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2268 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2269 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2270 t[row] = b[row] - tmp1; 2271 t[row+1] = b[row+1] - tmp2; 2272 t[row+2] = b[row+2] - tmp3; row += 3; 2273 cnt += 9; 2274 break; 2275 case 4: 2276 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2277 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2278 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2279 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2280 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2281 t[row] = b[row] - tmp1; 2282 t[row+1] = b[row+1] - tmp2; 2283 t[row+2] = b[row+2] - tmp3; 2284 t[row+3] = b[row+3] - tmp4; row += 4; 2285 cnt += 16; 2286 break; 2287 case 5: 2288 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2289 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2290 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2291 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2292 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2293 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2294 t[row] = b[row] - tmp1; 2295 t[row+1] = b[row+1] - tmp2; 2296 t[row+2] = b[row+2] - tmp3; 2297 t[row+3] = b[row+3] - tmp4; 2298 t[row+4] = b[row+4] - tmp5;row += 5; 2299 cnt += 25; 2300 break; 2301 default: 2302 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2303 } 2304 CHKMEMQ; 2305 } 2306 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2307 2308 2309 2310 /* 2311 Apply (L + D)^-1 where D is the block diagonal 2312 */ 2313 for (i=0, row=0; i<m; i++) { 2314 sz = diag[row] - ii[row]; 2315 v1 = a->a + ii[row]; 2316 idx = a->j + ii[row]; 2317 CHKMEMQ; 2318 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2319 switch (sizes[i]){ 2320 case 1: 2321 2322 sum1 = t[row]; 2323 for(n = 0; n<sz-1; n+=2) { 2324 i1 = idx[0]; 2325 i2 = idx[1]; 2326 idx += 2; 2327 tmp0 = t[i1]; 2328 tmp1 = t[i2]; 2329 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2330 } 2331 2332 if (n == sz-1){ 2333 tmp0 = t[*idx]; 2334 sum1 -= *v1 * tmp0; 2335 } 2336 x[row] += t[row] = sum1*(*ibdiag++); row++; 2337 break; 2338 case 2: 2339 v2 = a->a + ii[row+1]; 2340 sum1 = t[row]; 2341 sum2 = t[row+1]; 2342 for(n = 0; n<sz-1; n+=2) { 2343 i1 = idx[0]; 2344 i2 = idx[1]; 2345 idx += 2; 2346 tmp0 = t[i1]; 2347 tmp1 = t[i2]; 2348 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2349 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2350 } 2351 2352 if (n == sz-1){ 2353 tmp0 = t[*idx]; 2354 sum1 -= v1[0] * tmp0; 2355 sum2 -= v2[0] * tmp0; 2356 } 2357 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2358 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2359 ibdiag += 4; row += 2; 2360 break; 2361 case 3: 2362 v2 = a->a + ii[row+1]; 2363 v3 = a->a + ii[row+2]; 2364 sum1 = t[row]; 2365 sum2 = t[row+1]; 2366 sum3 = t[row+2]; 2367 for(n = 0; n<sz-1; n+=2) { 2368 i1 = idx[0]; 2369 i2 = idx[1]; 2370 idx += 2; 2371 tmp0 = t[i1]; 2372 tmp1 = t[i2]; 2373 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2374 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2375 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2376 } 2377 2378 if (n == sz-1){ 2379 tmp0 = t[*idx]; 2380 sum1 -= v1[0] * tmp0; 2381 sum2 -= v2[0] * tmp0; 2382 sum3 -= v3[0] * tmp0; 2383 } 2384 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2385 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2386 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2387 ibdiag += 9; row += 3; 2388 break; 2389 case 4: 2390 v2 = a->a + ii[row+1]; 2391 v3 = a->a + ii[row+2]; 2392 v4 = a->a + ii[row+3]; 2393 sum1 = t[row]; 2394 sum2 = t[row+1]; 2395 sum3 = t[row+2]; 2396 sum4 = t[row+3]; 2397 for(n = 0; n<sz-1; n+=2) { 2398 i1 = idx[0]; 2399 i2 = idx[1]; 2400 idx += 2; 2401 tmp0 = t[i1]; 2402 tmp1 = t[i2]; 2403 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2404 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2405 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2406 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2407 } 2408 2409 if (n == sz-1){ 2410 tmp0 = t[*idx]; 2411 sum1 -= v1[0] * tmp0; 2412 sum2 -= v2[0] * tmp0; 2413 sum3 -= v3[0] * tmp0; 2414 sum4 -= v4[0] * tmp0; 2415 } 2416 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2417 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2418 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2419 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2420 ibdiag += 16; row += 4; 2421 break; 2422 case 5: 2423 v2 = a->a + ii[row+1]; 2424 v3 = a->a + ii[row+2]; 2425 v4 = a->a + ii[row+3]; 2426 v5 = a->a + ii[row+4]; 2427 sum1 = t[row]; 2428 sum2 = t[row+1]; 2429 sum3 = t[row+2]; 2430 sum4 = t[row+3]; 2431 sum5 = t[row+4]; 2432 for(n = 0; n<sz-1; n+=2) { 2433 i1 = idx[0]; 2434 i2 = idx[1]; 2435 idx += 2; 2436 tmp0 = t[i1]; 2437 tmp1 = t[i2]; 2438 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2439 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2440 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2441 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2442 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2443 } 2444 2445 if (n == sz-1){ 2446 tmp0 = t[*idx]; 2447 sum1 -= v1[0] * tmp0; 2448 sum2 -= v2[0] * tmp0; 2449 sum3 -= v3[0] * tmp0; 2450 sum4 -= v4[0] * tmp0; 2451 sum5 -= v5[0] * tmp0; 2452 } 2453 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2454 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2455 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2456 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2457 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2458 ibdiag += 25; row += 5; 2459 break; 2460 default: 2461 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2462 } 2463 CHKMEMQ; 2464 } 2465 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2466 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2467 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2468 } 2469 PetscFunctionReturn(0); 2470 } 2471 2472 #undef __FUNCT__ 2473 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 2474 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2475 { 2476 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2477 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2478 const MatScalar *bdiag = a->inode.bdiag; 2479 const PetscScalar *b; 2480 PetscErrorCode ierr; 2481 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2482 const PetscInt *sizes = a->inode.size; 2483 2484 PetscFunctionBegin; 2485 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2486 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2487 cnt = 0; 2488 for (i=0, row=0; i<m; i++) { 2489 switch (sizes[i]){ 2490 case 1: 2491 x[row] = b[row]*bdiag[cnt++];row++; 2492 break; 2493 case 2: 2494 x1 = b[row]; x2 = b[row+1]; 2495 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2496 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2497 x[row++] = tmp1; 2498 x[row++] = tmp2; 2499 cnt += 4; 2500 break; 2501 case 3: 2502 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2503 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2504 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2505 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2506 x[row++] = tmp1; 2507 x[row++] = tmp2; 2508 x[row++] = tmp3; 2509 cnt += 9; 2510 break; 2511 case 4: 2512 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2513 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2514 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2515 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2516 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2517 x[row++] = tmp1; 2518 x[row++] = tmp2; 2519 x[row++] = tmp3; 2520 x[row++] = tmp4; 2521 cnt += 16; 2522 break; 2523 case 5: 2524 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2525 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2526 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2527 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2528 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2529 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2530 x[row++] = tmp1; 2531 x[row++] = tmp2; 2532 x[row++] = tmp3; 2533 x[row++] = tmp4; 2534 x[row++] = tmp5; 2535 cnt += 25; 2536 break; 2537 default: 2538 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2539 } 2540 } 2541 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2542 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2543 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2544 PetscFunctionReturn(0); 2545 } 2546 2547 /* 2548 samestructure indicates that the matrix has not changed its nonzero structure so we 2549 do not need to recompute the inodes 2550 */ 2551 #undef __FUNCT__ 2552 #define __FUNCT__ "Mat_CheckInode" 2553 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2554 { 2555 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2556 PetscErrorCode ierr; 2557 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2558 PetscTruth flag; 2559 2560 PetscFunctionBegin; 2561 if (!a->inode.use) PetscFunctionReturn(0); 2562 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2563 2564 2565 m = A->rmap->n; 2566 if (a->inode.size) {ns = a->inode.size;} 2567 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2568 2569 i = 0; 2570 node_count = 0; 2571 idx = a->j; 2572 ii = a->i; 2573 while (i < m){ /* For each row */ 2574 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2575 /* Limits the number of elements in a node to 'a->inode.limit' */ 2576 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2577 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2578 if (nzy != nzx) break; 2579 idy += nzx; /* Same nonzero pattern */ 2580 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2581 if (!flag) break; 2582 } 2583 ns[node_count++] = blk_size; 2584 idx += blk_size*nzx; 2585 i = j; 2586 } 2587 /* If not enough inodes found,, do not use inode version of the routines */ 2588 if (!a->inode.size && m && node_count > .9*m) { 2589 ierr = PetscFree(ns);CHKERRQ(ierr); 2590 a->inode.node_count = 0; 2591 a->inode.size = PETSC_NULL; 2592 a->inode.use = PETSC_FALSE; 2593 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2594 } else { 2595 A->ops->mult = MatMult_SeqAIJ_Inode; 2596 A->ops->sor = MatSOR_SeqAIJ_Inode; 2597 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 2598 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 2599 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 2600 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 2601 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 2602 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 2603 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 2604 a->inode.node_count = node_count; 2605 a->inode.size = ns; 2606 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2607 } 2608 PetscFunctionReturn(0); 2609 } 2610 2611 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 2612 PetscInt __k, *__vi; \ 2613 __vi = aj + ai[row]; \ 2614 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 2615 __vi = aj + adiag[row]; \ 2616 cols[nzl] = __vi[0];\ 2617 __vi = aj + adiag[row+1]+1;\ 2618 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 2619 2620 #undef __FUNCT__ 2621 #define __FUNCT__ "Mat_CheckInode_FactorLU" 2622 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 2623 { 2624 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2625 PetscErrorCode ierr; 2626 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 2627 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 2628 PetscTruth flag; 2629 2630 PetscFunctionBegin; 2631 if (!a->inode.use) PetscFunctionReturn(0); 2632 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2633 2634 m = A->rmap->n; 2635 if (a->inode.size) {ns = a->inode.size;} 2636 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2637 2638 i = 0; 2639 node_count = 0; 2640 while (i < m){ /* For each row */ 2641 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 2642 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 2643 nzx = nzl1 + nzu1 + 1; 2644 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 2645 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 2646 2647 /* Limits the number of elements in a node to 'a->inode.limit' */ 2648 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2649 nzl2 = ai[j+1] - ai[j]; 2650 nzu2 = adiag[j] - adiag[j+1] - 1; 2651 nzy = nzl2 + nzu2 + 1; 2652 if( nzy != nzx) break; 2653 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 2654 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 2655 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2656 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 2657 ierr = PetscFree(cols2);CHKERRQ(ierr); 2658 } 2659 ns[node_count++] = blk_size; 2660 ierr = PetscFree(cols1);CHKERRQ(ierr); 2661 i = j; 2662 } 2663 /* If not enough inodes found,, do not use inode version of the routines */ 2664 if (!a->inode.size && m && node_count > .9*m) { 2665 ierr = PetscFree(ns);CHKERRQ(ierr); 2666 a->inode.node_count = 0; 2667 a->inode.size = PETSC_NULL; 2668 a->inode.use = PETSC_FALSE; 2669 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2670 } else { 2671 A->ops->mult = 0; 2672 A->ops->sor = 0; 2673 A->ops->multadd = 0; 2674 A->ops->getrowij = 0; 2675 A->ops->restorerowij = 0; 2676 A->ops->getcolumnij = 0; 2677 A->ops->restorecolumnij = 0; 2678 A->ops->coloringpatch = 0; 2679 A->ops->multdiagonalblock = 0; 2680 a->inode.node_count = node_count; 2681 a->inode.size = ns; 2682 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2683 } 2684 PetscFunctionReturn(0); 2685 } 2686 2687 /* 2688 This is really ugly. if inodes are used this replaces the 2689 permutations with ones that correspond to rows/cols of the matrix 2690 rather then inode blocks 2691 */ 2692 #undef __FUNCT__ 2693 #define __FUNCT__ "MatInodeAdjustForInodes" 2694 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2695 { 2696 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2697 2698 PetscFunctionBegin; 2699 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2700 if (f) { 2701 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2702 } 2703 PetscFunctionReturn(0); 2704 } 2705 2706 EXTERN_C_BEGIN 2707 #undef __FUNCT__ 2708 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 2709 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 2710 { 2711 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2712 PetscErrorCode ierr; 2713 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2714 const PetscInt *ridx,*cidx; 2715 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2716 PetscInt nslim_col,*ns_col; 2717 IS ris = *rperm,cis = *cperm; 2718 2719 PetscFunctionBegin; 2720 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2721 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2722 2723 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2724 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2725 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 2726 2727 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2728 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2729 2730 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2731 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2732 2733 /* Construct the permutations for rows*/ 2734 for (i=0,row = 0; i<nslim_row; ++i){ 2735 indx = ridx[i]; 2736 start_val = tns[indx]; 2737 end_val = tns[indx + 1]; 2738 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2739 } 2740 2741 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2742 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2743 2744 /* Construct permutations for columns */ 2745 for (i=0,col=0; i<nslim_col; ++i){ 2746 indx = cidx[i]; 2747 start_val = tns[indx]; 2748 end_val = tns[indx + 1]; 2749 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2750 } 2751 2752 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2753 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 2754 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2755 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 2756 2757 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2758 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2759 2760 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2761 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 2762 ierr = ISDestroy(cis);CHKERRQ(ierr); 2763 ierr = ISDestroy(ris);CHKERRQ(ierr); 2764 ierr = PetscFree(tns);CHKERRQ(ierr); 2765 PetscFunctionReturn(0); 2766 } 2767 EXTERN_C_END 2768 2769 #undef __FUNCT__ 2770 #define __FUNCT__ "MatInodeGetInodeSizes" 2771 /*@C 2772 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2773 2774 Collective on Mat 2775 2776 Input Parameter: 2777 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2778 2779 Output Parameter: 2780 + node_count - no of inodes present in the matrix. 2781 . sizes - an array of size node_count,with sizes of each inode. 2782 - limit - the max size used to generate the inodes. 2783 2784 Level: advanced 2785 2786 Notes: This routine returns some internal storage information 2787 of the matrix, it is intended to be used by advanced users. 2788 It should be called after the matrix is assembled. 2789 The contents of the sizes[] array should not be changed. 2790 PETSC_NULL may be passed for information not requested. 2791 2792 .keywords: matrix, seqaij, get, inode 2793 2794 .seealso: MatGetInfo() 2795 @*/ 2796 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2797 { 2798 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2799 2800 PetscFunctionBegin; 2801 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2802 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2803 if (f) { 2804 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2805 } 2806 PetscFunctionReturn(0); 2807 } 2808 2809 EXTERN_C_BEGIN 2810 #undef __FUNCT__ 2811 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 2812 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2813 { 2814 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2815 2816 PetscFunctionBegin; 2817 if (node_count) *node_count = a->inode.node_count; 2818 if (sizes) *sizes = a->inode.size; 2819 if (limit) *limit = a->inode.limit; 2820 PetscFunctionReturn(0); 2821 } 2822 EXTERN_C_END 2823