1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 781 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 /* ----------------------------------------------------------- */ 1182 #undef __FUNCT__ 1183 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_newdatastruct" 1184 PetscErrorCode MatSolve_SeqAIJ_Inode_newdatastruct(Mat A,Vec bb,Vec xx) 1185 { 1186 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1187 IS iscol = a->col,isrow = a->row; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*c,*rout,*cout; 1190 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 1191 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 1192 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 1193 PetscScalar sum1,sum2,sum3,sum4,sum5; 1194 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 1195 const PetscScalar *b; 1196 1197 PetscFunctionBegin; 1198 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 1199 node_max = a->inode.node_count; 1200 ns = a->inode.size; /* Node Size array */ 1201 1202 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1203 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1204 tmp = a->solve_work; 1205 1206 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1207 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1208 1209 /* forward solve the lower triangular */ 1210 tmps = tmp ; 1211 aa = a_a ; 1212 aj = a_j ; 1213 ad = a->diag; 1214 1215 for (i = 0,row = 0; i< node_max; ++i){ 1216 nsz = ns[i]; 1217 aii = ai[row]; 1218 v1 = aa + aii; 1219 vi = aj + aii; 1220 nz = ai[row+1]- ai[row]; 1221 1222 switch (nsz){ /* Each loop in 'case' is unrolled */ 1223 case 1 : 1224 sum1 = b[r[row]]; 1225 for(j=0; j<nz-1; j+=2){ 1226 i0 = vi[j]; 1227 i1 = vi[j+1]; 1228 tmp0 = tmps[i0]; 1229 tmp1 = tmps[i1]; 1230 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 1231 } 1232 if(j == nz-1){ 1233 tmp0 = tmps[vi[j]]; 1234 sum1 -= v1[j]*tmp0; 1235 } 1236 tmp[row++]=sum1; 1237 break; 1238 case 2: 1239 sum1 = b[r[row]]; 1240 sum2 = b[r[row+1]]; 1241 v2 = aa + ai[row+1]; 1242 1243 for(j=0; j<nz-1; j+=2){ 1244 i0 = vi[j]; 1245 i1 = vi[j+1]; 1246 tmp0 = tmps[i0]; 1247 tmp1 = tmps[i1]; 1248 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1249 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1250 } 1251 if(j == nz-1){ 1252 tmp0 = tmps[vi[j]]; 1253 sum1 -= v1[j] *tmp0; 1254 sum2 -= v2[j] *tmp0; 1255 } 1256 sum2 -= v2[nz] * sum1; 1257 tmp[row ++]=sum1; 1258 tmp[row ++]=sum2; 1259 break; 1260 case 3: 1261 sum1 = b[r[row]]; 1262 sum2 = b[r[row+1]]; 1263 sum3 = b[r[row+2]]; 1264 v2 = aa + ai[row+1]; 1265 v3 = aa + ai[row+2]; 1266 1267 for (j=0; j<nz-1; j+=2){ 1268 i0 = vi[j]; 1269 i1 = vi[j+1]; 1270 tmp0 = tmps[i0]; 1271 tmp1 = tmps[i1]; 1272 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1273 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1274 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1275 } 1276 if (j == nz-1){ 1277 tmp0 = tmps[vi[j]]; 1278 sum1 -= v1[j] *tmp0; 1279 sum2 -= v2[j] *tmp0; 1280 sum3 -= v3[j] *tmp0; 1281 } 1282 sum2 -= v2[nz] * sum1; 1283 sum3 -= v3[nz] * sum1; 1284 sum3 -= v3[nz+1] * sum2; 1285 tmp[row ++]=sum1; 1286 tmp[row ++]=sum2; 1287 tmp[row ++]=sum3; 1288 break; 1289 1290 case 4: 1291 sum1 = b[r[row]]; 1292 sum2 = b[r[row+1]]; 1293 sum3 = b[r[row+2]]; 1294 sum4 = b[r[row+3]]; 1295 v2 = aa + ai[row+1]; 1296 v3 = aa + ai[row+2]; 1297 v4 = aa + ai[row+3]; 1298 1299 for (j=0; j<nz-1; j+=2){ 1300 i0 = vi[j]; 1301 i1 = vi[j+1]; 1302 tmp0 = tmps[i0]; 1303 tmp1 = tmps[i1]; 1304 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1305 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1306 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1307 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1308 } 1309 if (j == nz-1){ 1310 tmp0 = tmps[vi[j]]; 1311 sum1 -= v1[j] *tmp0; 1312 sum2 -= v2[j] *tmp0; 1313 sum3 -= v3[j] *tmp0; 1314 sum4 -= v4[j] *tmp0; 1315 } 1316 sum2 -= v2[nz] * sum1; 1317 sum3 -= v3[nz] * sum1; 1318 sum4 -= v4[nz] * sum1; 1319 sum3 -= v3[nz+1] * sum2; 1320 sum4 -= v4[nz+1] * sum2; 1321 sum4 -= v4[nz+2] * sum3; 1322 1323 tmp[row ++]=sum1; 1324 tmp[row ++]=sum2; 1325 tmp[row ++]=sum3; 1326 tmp[row ++]=sum4; 1327 break; 1328 case 5: 1329 sum1 = b[r[row]]; 1330 sum2 = b[r[row+1]]; 1331 sum3 = b[r[row+2]]; 1332 sum4 = b[r[row+3]]; 1333 sum5 = b[r[row+4]]; 1334 v2 = aa + ai[row+1]; 1335 v3 = aa + ai[row+2]; 1336 v4 = aa + ai[row+3]; 1337 v5 = aa + ai[row+4]; 1338 1339 for (j=0; j<nz-1; j+=2){ 1340 i0 = vi[j]; 1341 i1 = vi[j+1]; 1342 tmp0 = tmps[i0]; 1343 tmp1 = tmps[i1]; 1344 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1345 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1346 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1347 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1348 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 1349 } 1350 if (j == nz-1){ 1351 tmp0 = tmps[vi[j]]; 1352 sum1 -= v1[j] *tmp0; 1353 sum2 -= v2[j] *tmp0; 1354 sum3 -= v3[j] *tmp0; 1355 sum4 -= v4[j] *tmp0; 1356 sum5 -= v5[j] *tmp0; 1357 } 1358 1359 sum2 -= v2[nz] * sum1; 1360 sum3 -= v3[nz] * sum1; 1361 sum4 -= v4[nz] * sum1; 1362 sum5 -= v5[nz] * sum1; 1363 sum3 -= v3[nz+1] * sum2; 1364 sum4 -= v4[nz+1] * sum2; 1365 sum5 -= v5[nz+1] * sum2; 1366 sum4 -= v4[nz+2] * sum3; 1367 sum5 -= v5[nz+2] * sum3; 1368 sum5 -= v5[nz+3] * sum4; 1369 1370 tmp[row ++]=sum1; 1371 tmp[row ++]=sum2; 1372 tmp[row ++]=sum3; 1373 tmp[row ++]=sum4; 1374 tmp[row ++]=sum5; 1375 break; 1376 default: 1377 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1378 } 1379 } 1380 /* backward solve the upper triangular */ 1381 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 1382 nsz = ns[i]; 1383 aii = ad[row+1] + 1; 1384 v1 = aa + aii; 1385 vi = aj + aii; 1386 nz = ad[row]- ad[row+1] - 1; 1387 switch (nsz){ /* Each loop in 'case' is unrolled */ 1388 case 1 : 1389 sum1 = tmp[row]; 1390 1391 for(j=0 ; j<nz-1; j+=2){ 1392 i0 = vi[j]; 1393 i1 = vi[j+1]; 1394 tmp0 = tmps[i0]; 1395 tmp1 = tmps[i1]; 1396 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1397 } 1398 if (j == nz-1){ 1399 tmp0 = tmps[vi[j]]; 1400 sum1 -= v1[j]*tmp0; 1401 } 1402 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1403 break; 1404 case 2 : 1405 sum1 = tmp[row]; 1406 sum2 = tmp[row-1]; 1407 v2 = aa + ad[row] + 1; 1408 for (j=0 ; j<nz-1; j+=2){ 1409 i0 = vi[j]; 1410 i1 = vi[j+1]; 1411 tmp0 = tmps[i0]; 1412 tmp1 = tmps[i1]; 1413 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1414 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1415 } 1416 if (j == nz-1){ 1417 tmp0 = tmps[vi[j]]; 1418 sum1 -= v1[j]* tmp0; 1419 sum2 -= v2[j+1]* tmp0; 1420 } 1421 1422 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1423 sum2 -= v2[0] * tmp0; 1424 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1425 break; 1426 case 3 : 1427 sum1 = tmp[row]; 1428 sum2 = tmp[row -1]; 1429 sum3 = tmp[row -2]; 1430 v2 = aa + ad[row] + 1; 1431 v3 = aa + ad[row -1] + 1; 1432 for (j=0 ; j<nz-1; j+=2){ 1433 i0 = vi[j]; 1434 i1 = vi[j+1]; 1435 tmp0 = tmps[i0]; 1436 tmp1 = tmps[i1]; 1437 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1438 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1439 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1440 } 1441 if (j== nz-1){ 1442 tmp0 = tmps[vi[j]]; 1443 sum1 -= v1[j] * tmp0; 1444 sum2 -= v2[j+1] * tmp0; 1445 sum3 -= v3[j+2] * tmp0; 1446 } 1447 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1448 sum2 -= v2[0]* tmp0; 1449 sum3 -= v3[1] * tmp0; 1450 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1451 sum3 -= v3[0]* tmp0; 1452 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1453 1454 break; 1455 case 4 : 1456 sum1 = tmp[row]; 1457 sum2 = tmp[row -1]; 1458 sum3 = tmp[row -2]; 1459 sum4 = tmp[row -3]; 1460 v2 = aa + ad[row]+1; 1461 v3 = aa + ad[row -1]+1; 1462 v4 = aa + ad[row -2]+1; 1463 1464 for (j=0 ; j<nz-1; j+=2){ 1465 i0 = vi[j]; 1466 i1 = vi[j+1]; 1467 tmp0 = tmps[i0]; 1468 tmp1 = tmps[i1]; 1469 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1470 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1471 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1472 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1473 } 1474 if (j== nz-1){ 1475 tmp0 = tmps[vi[j]]; 1476 sum1 -= v1[j] * tmp0; 1477 sum2 -= v2[j+1] * tmp0; 1478 sum3 -= v3[j+2] * tmp0; 1479 sum4 -= v4[j+3] * tmp0; 1480 } 1481 1482 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1483 sum2 -= v2[0] * tmp0; 1484 sum3 -= v3[1] * tmp0; 1485 sum4 -= v4[2] * tmp0; 1486 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1487 sum3 -= v3[0] * tmp0; 1488 sum4 -= v4[1] * tmp0; 1489 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1490 sum4 -= v4[0] * tmp0; 1491 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1492 break; 1493 case 5 : 1494 sum1 = tmp[row]; 1495 sum2 = tmp[row -1]; 1496 sum3 = tmp[row -2]; 1497 sum4 = tmp[row -3]; 1498 sum5 = tmp[row -4]; 1499 v2 = aa + ad[row]+1; 1500 v3 = aa + ad[row -1]+1; 1501 v4 = aa + ad[row -2]+1; 1502 v5 = aa + ad[row -3]+1; 1503 for (j=0 ; j<nz-1; j+=2){ 1504 i0 = vi[j]; 1505 i1 = vi[j+1]; 1506 tmp0 = tmps[i0]; 1507 tmp1 = tmps[i1]; 1508 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1509 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1510 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1511 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1512 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 1513 } 1514 if (j==nz-1){ 1515 tmp0 = tmps[vi[j]]; 1516 sum1 -= v1[j] * tmp0; 1517 sum2 -= v2[j+1] * tmp0; 1518 sum3 -= v3[j+2] * tmp0; 1519 sum4 -= v4[j+3] * tmp0; 1520 sum5 -= v5[j+4] * tmp0; 1521 } 1522 1523 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1524 sum2 -= v2[0] * tmp0; 1525 sum3 -= v3[1] * tmp0; 1526 sum4 -= v4[2] * tmp0; 1527 sum5 -= v5[3] * tmp0; 1528 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1529 sum3 -= v3[0] * tmp0; 1530 sum4 -= v4[1] * tmp0; 1531 sum5 -= v5[2] * tmp0; 1532 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1533 sum4 -= v4[0] * tmp0; 1534 sum5 -= v5[1] * tmp0; 1535 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1536 sum5 -= v5[0] * tmp0; 1537 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 1538 break; 1539 default: 1540 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1541 } 1542 } 1543 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1544 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1545 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1546 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1547 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 1552 /* 1553 Makes a longer coloring[] array and calls the usual code with that 1554 */ 1555 #undef __FUNCT__ 1556 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 1557 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1558 { 1559 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1560 PetscErrorCode ierr; 1561 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1562 PetscInt *colorused,i; 1563 ISColoringValue *newcolor; 1564 1565 PetscFunctionBegin; 1566 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1567 /* loop over inodes, marking a color for each column*/ 1568 row = 0; 1569 for (i=0; i<m; i++){ 1570 for (j=0; j<ns[i]; j++) { 1571 newcolor[row++] = coloring[i] + j*ncolors; 1572 } 1573 } 1574 1575 /* eliminate unneeded colors */ 1576 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1577 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1578 for (i=0; i<n; i++) { 1579 colorused[newcolor[i]] = 1; 1580 } 1581 1582 for (i=1; i<5*ncolors; i++) { 1583 colorused[i] += colorused[i-1]; 1584 } 1585 ncolors = colorused[5*ncolors-1]; 1586 for (i=0; i<n; i++) { 1587 newcolor[i] = colorused[newcolor[i]]-1; 1588 } 1589 ierr = PetscFree(colorused);CHKERRQ(ierr); 1590 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1591 ierr = PetscFree(coloring);CHKERRQ(ierr); 1592 PetscFunctionReturn(0); 1593 } 1594 1595 #include "../src/mat/blockinvert.h" 1596 1597 #undef __FUNCT__ 1598 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 1599 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1600 { 1601 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1602 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1603 MatScalar *ibdiag,*bdiag; 1604 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1605 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1606 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1607 PetscErrorCode ierr; 1608 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1609 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1610 1611 PetscFunctionBegin; 1612 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1613 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1614 if (its > 1) { 1615 /* switch to non-inode version */ 1616 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1617 PetscFunctionReturn(0); 1618 } 1619 1620 if (!a->inode.ibdiagvalid) { 1621 if (!a->inode.ibdiag) { 1622 /* calculate space needed for diagonal blocks */ 1623 for (i=0; i<m; i++) { 1624 cnt += sizes[i]*sizes[i]; 1625 } 1626 a->inode.bdiagsize = cnt; 1627 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1628 } 1629 1630 /* copy over the diagonal blocks and invert them */ 1631 ibdiag = a->inode.ibdiag; 1632 bdiag = a->inode.bdiag; 1633 cnt = 0; 1634 for (i=0, row = 0; i<m; i++) { 1635 for (j=0; j<sizes[i]; j++) { 1636 for (k=0; k<sizes[i]; k++) { 1637 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1638 } 1639 } 1640 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1641 1642 switch(sizes[i]) { 1643 case 1: 1644 /* Create matrix data structure */ 1645 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1646 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1647 break; 1648 case 2: 1649 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1650 break; 1651 case 3: 1652 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1653 break; 1654 case 4: 1655 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1656 break; 1657 case 5: 1658 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1659 break; 1660 default: 1661 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1662 } 1663 cnt += sizes[i]*sizes[i]; 1664 row += sizes[i]; 1665 } 1666 a->inode.ibdiagvalid = PETSC_TRUE; 1667 } 1668 ibdiag = a->inode.ibdiag; 1669 bdiag = a->inode.bdiag; 1670 1671 1672 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1673 if (flag & SOR_ZERO_INITIAL_GUESS) { 1674 PetscScalar *b; 1675 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1676 if (xx != bb) { 1677 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1678 } else { 1679 b = x; 1680 } 1681 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1682 1683 for (i=0, row=0; i<m; i++) { 1684 sz = diag[row] - ii[row]; 1685 v1 = a->a + ii[row]; 1686 idx = a->j + ii[row]; 1687 1688 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1689 switch (sizes[i]){ 1690 case 1: 1691 1692 sum1 = b[row]; 1693 for(n = 0; n<sz-1; n+=2) { 1694 i1 = idx[0]; 1695 i2 = idx[1]; 1696 idx += 2; 1697 tmp0 = x[i1]; 1698 tmp1 = x[i2]; 1699 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1700 } 1701 1702 if (n == sz-1){ 1703 tmp0 = x[*idx]; 1704 sum1 -= *v1 * tmp0; 1705 } 1706 x[row++] = sum1*(*ibdiag++); 1707 break; 1708 case 2: 1709 v2 = a->a + ii[row+1]; 1710 sum1 = b[row]; 1711 sum2 = b[row+1]; 1712 for(n = 0; n<sz-1; n+=2) { 1713 i1 = idx[0]; 1714 i2 = idx[1]; 1715 idx += 2; 1716 tmp0 = x[i1]; 1717 tmp1 = x[i2]; 1718 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1719 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1720 } 1721 1722 if (n == sz-1){ 1723 tmp0 = x[*idx]; 1724 sum1 -= v1[0] * tmp0; 1725 sum2 -= v2[0] * tmp0; 1726 } 1727 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1728 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1729 ibdiag += 4; 1730 break; 1731 case 3: 1732 v2 = a->a + ii[row+1]; 1733 v3 = a->a + ii[row+2]; 1734 sum1 = b[row]; 1735 sum2 = b[row+1]; 1736 sum3 = b[row+2]; 1737 for(n = 0; n<sz-1; n+=2) { 1738 i1 = idx[0]; 1739 i2 = idx[1]; 1740 idx += 2; 1741 tmp0 = x[i1]; 1742 tmp1 = x[i2]; 1743 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1744 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1745 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1746 } 1747 1748 if (n == sz-1){ 1749 tmp0 = x[*idx]; 1750 sum1 -= v1[0] * tmp0; 1751 sum2 -= v2[0] * tmp0; 1752 sum3 -= v3[0] * tmp0; 1753 } 1754 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1755 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1756 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1757 ibdiag += 9; 1758 break; 1759 case 4: 1760 v2 = a->a + ii[row+1]; 1761 v3 = a->a + ii[row+2]; 1762 v4 = a->a + ii[row+3]; 1763 sum1 = b[row]; 1764 sum2 = b[row+1]; 1765 sum3 = b[row+2]; 1766 sum4 = b[row+3]; 1767 for(n = 0; n<sz-1; n+=2) { 1768 i1 = idx[0]; 1769 i2 = idx[1]; 1770 idx += 2; 1771 tmp0 = x[i1]; 1772 tmp1 = x[i2]; 1773 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1774 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1775 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1776 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1777 } 1778 1779 if (n == sz-1){ 1780 tmp0 = x[*idx]; 1781 sum1 -= v1[0] * tmp0; 1782 sum2 -= v2[0] * tmp0; 1783 sum3 -= v3[0] * tmp0; 1784 sum4 -= v4[0] * tmp0; 1785 } 1786 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1787 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1788 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1789 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1790 ibdiag += 16; 1791 break; 1792 case 5: 1793 v2 = a->a + ii[row+1]; 1794 v3 = a->a + ii[row+2]; 1795 v4 = a->a + ii[row+3]; 1796 v5 = a->a + ii[row+4]; 1797 sum1 = b[row]; 1798 sum2 = b[row+1]; 1799 sum3 = b[row+2]; 1800 sum4 = b[row+3]; 1801 sum5 = b[row+4]; 1802 for(n = 0; n<sz-1; n+=2) { 1803 i1 = idx[0]; 1804 i2 = idx[1]; 1805 idx += 2; 1806 tmp0 = x[i1]; 1807 tmp1 = x[i2]; 1808 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1809 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1810 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1811 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1812 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1813 } 1814 1815 if (n == sz-1){ 1816 tmp0 = x[*idx]; 1817 sum1 -= v1[0] * tmp0; 1818 sum2 -= v2[0] * tmp0; 1819 sum3 -= v3[0] * tmp0; 1820 sum4 -= v4[0] * tmp0; 1821 sum5 -= v5[0] * tmp0; 1822 } 1823 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1824 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1825 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1826 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1827 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1828 ibdiag += 25; 1829 break; 1830 default: 1831 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1832 } 1833 } 1834 1835 xb = x; 1836 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1837 } else xb = b; 1838 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1839 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1840 cnt = 0; 1841 for (i=0, row=0; i<m; i++) { 1842 1843 switch (sizes[i]){ 1844 case 1: 1845 x[row++] *= bdiag[cnt++]; 1846 break; 1847 case 2: 1848 x1 = x[row]; x2 = x[row+1]; 1849 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1850 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1851 x[row++] = tmp1; 1852 x[row++] = tmp2; 1853 cnt += 4; 1854 break; 1855 case 3: 1856 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1857 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1858 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1859 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1860 x[row++] = tmp1; 1861 x[row++] = tmp2; 1862 x[row++] = tmp3; 1863 cnt += 9; 1864 break; 1865 case 4: 1866 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1867 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1868 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1869 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1870 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1871 x[row++] = tmp1; 1872 x[row++] = tmp2; 1873 x[row++] = tmp3; 1874 x[row++] = tmp4; 1875 cnt += 16; 1876 break; 1877 case 5: 1878 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1879 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1880 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1881 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1882 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1883 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1884 x[row++] = tmp1; 1885 x[row++] = tmp2; 1886 x[row++] = tmp3; 1887 x[row++] = tmp4; 1888 x[row++] = tmp5; 1889 cnt += 25; 1890 break; 1891 default: 1892 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1893 } 1894 } 1895 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1896 } 1897 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1898 1899 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1900 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1901 ibdiag -= sizes[i]*sizes[i]; 1902 sz = ii[row+1] - diag[row] - 1; 1903 v1 = a->a + diag[row] + 1; 1904 idx = a->j + diag[row] + 1; 1905 1906 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1907 switch (sizes[i]){ 1908 case 1: 1909 1910 sum1 = xb[row]; 1911 for(n = 0; n<sz-1; n+=2) { 1912 i1 = idx[0]; 1913 i2 = idx[1]; 1914 idx += 2; 1915 tmp0 = x[i1]; 1916 tmp1 = x[i2]; 1917 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1918 } 1919 1920 if (n == sz-1){ 1921 tmp0 = x[*idx]; 1922 sum1 -= *v1*tmp0; 1923 } 1924 x[row--] = sum1*(*ibdiag); 1925 break; 1926 1927 case 2: 1928 1929 sum1 = xb[row]; 1930 sum2 = xb[row-1]; 1931 /* note that sum1 is associated with the second of the two rows */ 1932 v2 = a->a + diag[row-1] + 2; 1933 for(n = 0; n<sz-1; n+=2) { 1934 i1 = idx[0]; 1935 i2 = idx[1]; 1936 idx += 2; 1937 tmp0 = x[i1]; 1938 tmp1 = x[i2]; 1939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1941 } 1942 1943 if (n == sz-1){ 1944 tmp0 = x[*idx]; 1945 sum1 -= *v1*tmp0; 1946 sum2 -= *v2*tmp0; 1947 } 1948 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1949 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1950 break; 1951 case 3: 1952 1953 sum1 = xb[row]; 1954 sum2 = xb[row-1]; 1955 sum3 = xb[row-2]; 1956 v2 = a->a + diag[row-1] + 2; 1957 v3 = a->a + diag[row-2] + 3; 1958 for(n = 0; n<sz-1; n+=2) { 1959 i1 = idx[0]; 1960 i2 = idx[1]; 1961 idx += 2; 1962 tmp0 = x[i1]; 1963 tmp1 = x[i2]; 1964 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1965 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1966 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1967 } 1968 1969 if (n == sz-1){ 1970 tmp0 = x[*idx]; 1971 sum1 -= *v1*tmp0; 1972 sum2 -= *v2*tmp0; 1973 sum3 -= *v3*tmp0; 1974 } 1975 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1976 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1977 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1978 break; 1979 case 4: 1980 1981 sum1 = xb[row]; 1982 sum2 = xb[row-1]; 1983 sum3 = xb[row-2]; 1984 sum4 = xb[row-3]; 1985 v2 = a->a + diag[row-1] + 2; 1986 v3 = a->a + diag[row-2] + 3; 1987 v4 = a->a + diag[row-3] + 4; 1988 for(n = 0; n<sz-1; n+=2) { 1989 i1 = idx[0]; 1990 i2 = idx[1]; 1991 idx += 2; 1992 tmp0 = x[i1]; 1993 tmp1 = x[i2]; 1994 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1995 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1996 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1997 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1998 } 1999 2000 if (n == sz-1){ 2001 tmp0 = x[*idx]; 2002 sum1 -= *v1*tmp0; 2003 sum2 -= *v2*tmp0; 2004 sum3 -= *v3*tmp0; 2005 sum4 -= *v4*tmp0; 2006 } 2007 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2008 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2009 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2010 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2011 break; 2012 case 5: 2013 2014 sum1 = xb[row]; 2015 sum2 = xb[row-1]; 2016 sum3 = xb[row-2]; 2017 sum4 = xb[row-3]; 2018 sum5 = xb[row-4]; 2019 v2 = a->a + diag[row-1] + 2; 2020 v3 = a->a + diag[row-2] + 3; 2021 v4 = a->a + diag[row-3] + 4; 2022 v5 = a->a + diag[row-4] + 5; 2023 for(n = 0; n<sz-1; n+=2) { 2024 i1 = idx[0]; 2025 i2 = idx[1]; 2026 idx += 2; 2027 tmp0 = x[i1]; 2028 tmp1 = x[i2]; 2029 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2030 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2031 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2032 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2033 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2034 } 2035 2036 if (n == sz-1){ 2037 tmp0 = x[*idx]; 2038 sum1 -= *v1*tmp0; 2039 sum2 -= *v2*tmp0; 2040 sum3 -= *v3*tmp0; 2041 sum4 -= *v4*tmp0; 2042 sum5 -= *v5*tmp0; 2043 } 2044 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2045 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2046 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2047 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2048 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2049 break; 2050 default: 2051 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2052 } 2053 } 2054 2055 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2056 } 2057 its--; 2058 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2059 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2060 } 2061 if (flag & SOR_EISENSTAT) { 2062 const PetscScalar *b; 2063 MatScalar *t = a->inode.ssor_work; 2064 2065 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2066 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2067 /* 2068 Apply (U + D)^-1 where D is now the block diagonal 2069 */ 2070 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2071 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2072 ibdiag -= sizes[i]*sizes[i]; 2073 sz = ii[row+1] - diag[row] - 1; 2074 v1 = a->a + diag[row] + 1; 2075 idx = a->j + diag[row] + 1; 2076 CHKMEMQ; 2077 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2078 switch (sizes[i]){ 2079 case 1: 2080 2081 sum1 = b[row]; 2082 for(n = 0; n<sz-1; n+=2) { 2083 i1 = idx[0]; 2084 i2 = idx[1]; 2085 idx += 2; 2086 tmp0 = x[i1]; 2087 tmp1 = x[i2]; 2088 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2089 } 2090 2091 if (n == sz-1){ 2092 tmp0 = x[*idx]; 2093 sum1 -= *v1*tmp0; 2094 } 2095 x[row] = sum1*(*ibdiag);row--; 2096 break; 2097 2098 case 2: 2099 2100 sum1 = b[row]; 2101 sum2 = b[row-1]; 2102 /* note that sum1 is associated with the second of the two rows */ 2103 v2 = a->a + diag[row-1] + 2; 2104 for(n = 0; n<sz-1; n+=2) { 2105 i1 = idx[0]; 2106 i2 = idx[1]; 2107 idx += 2; 2108 tmp0 = x[i1]; 2109 tmp1 = x[i2]; 2110 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2111 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2112 } 2113 2114 if (n == sz-1){ 2115 tmp0 = x[*idx]; 2116 sum1 -= *v1*tmp0; 2117 sum2 -= *v2*tmp0; 2118 } 2119 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2120 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2121 row -= 2; 2122 break; 2123 case 3: 2124 2125 sum1 = b[row]; 2126 sum2 = b[row-1]; 2127 sum3 = b[row-2]; 2128 v2 = a->a + diag[row-1] + 2; 2129 v3 = a->a + diag[row-2] + 3; 2130 for(n = 0; n<sz-1; n+=2) { 2131 i1 = idx[0]; 2132 i2 = idx[1]; 2133 idx += 2; 2134 tmp0 = x[i1]; 2135 tmp1 = x[i2]; 2136 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2137 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2138 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2139 } 2140 2141 if (n == sz-1){ 2142 tmp0 = x[*idx]; 2143 sum1 -= *v1*tmp0; 2144 sum2 -= *v2*tmp0; 2145 sum3 -= *v3*tmp0; 2146 } 2147 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2148 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2149 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2150 row -= 3; 2151 break; 2152 case 4: 2153 2154 sum1 = b[row]; 2155 sum2 = b[row-1]; 2156 sum3 = b[row-2]; 2157 sum4 = b[row-3]; 2158 v2 = a->a + diag[row-1] + 2; 2159 v3 = a->a + diag[row-2] + 3; 2160 v4 = a->a + diag[row-3] + 4; 2161 for(n = 0; n<sz-1; n+=2) { 2162 i1 = idx[0]; 2163 i2 = idx[1]; 2164 idx += 2; 2165 tmp0 = x[i1]; 2166 tmp1 = x[i2]; 2167 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2168 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2169 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2170 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2171 } 2172 2173 if (n == sz-1){ 2174 tmp0 = x[*idx]; 2175 sum1 -= *v1*tmp0; 2176 sum2 -= *v2*tmp0; 2177 sum3 -= *v3*tmp0; 2178 sum4 -= *v4*tmp0; 2179 } 2180 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2181 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2182 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2183 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2184 row -= 4; 2185 break; 2186 case 5: 2187 2188 sum1 = b[row]; 2189 sum2 = b[row-1]; 2190 sum3 = b[row-2]; 2191 sum4 = b[row-3]; 2192 sum5 = b[row-4]; 2193 v2 = a->a + diag[row-1] + 2; 2194 v3 = a->a + diag[row-2] + 3; 2195 v4 = a->a + diag[row-3] + 4; 2196 v5 = a->a + diag[row-4] + 5; 2197 for(n = 0; n<sz-1; n+=2) { 2198 i1 = idx[0]; 2199 i2 = idx[1]; 2200 idx += 2; 2201 tmp0 = x[i1]; 2202 tmp1 = x[i2]; 2203 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2204 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2205 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2206 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2207 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2208 } 2209 2210 if (n == sz-1){ 2211 tmp0 = x[*idx]; 2212 sum1 -= *v1*tmp0; 2213 sum2 -= *v2*tmp0; 2214 sum3 -= *v3*tmp0; 2215 sum4 -= *v4*tmp0; 2216 sum5 -= *v5*tmp0; 2217 } 2218 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2219 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2220 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2221 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2222 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2223 row -= 5; 2224 break; 2225 default: 2226 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2227 } 2228 CHKMEMQ; 2229 } 2230 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2231 2232 /* 2233 t = b - D x where D is the block diagonal 2234 */ 2235 cnt = 0; 2236 for (i=0, row=0; i<m; i++) { 2237 CHKMEMQ; 2238 switch (sizes[i]){ 2239 case 1: 2240 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2241 break; 2242 case 2: 2243 x1 = x[row]; x2 = x[row+1]; 2244 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2245 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2246 t[row] = b[row] - tmp1; 2247 t[row+1] = b[row+1] - tmp2; row += 2; 2248 cnt += 4; 2249 break; 2250 case 3: 2251 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2252 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2253 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2254 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2255 t[row] = b[row] - tmp1; 2256 t[row+1] = b[row+1] - tmp2; 2257 t[row+2] = b[row+2] - tmp3; row += 3; 2258 cnt += 9; 2259 break; 2260 case 4: 2261 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2262 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2263 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2264 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2265 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2266 t[row] = b[row] - tmp1; 2267 t[row+1] = b[row+1] - tmp2; 2268 t[row+2] = b[row+2] - tmp3; 2269 t[row+3] = b[row+3] - tmp4; row += 4; 2270 cnt += 16; 2271 break; 2272 case 5: 2273 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2274 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2275 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2276 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2277 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2278 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2279 t[row] = b[row] - tmp1; 2280 t[row+1] = b[row+1] - tmp2; 2281 t[row+2] = b[row+2] - tmp3; 2282 t[row+3] = b[row+3] - tmp4; 2283 t[row+4] = b[row+4] - tmp5;row += 5; 2284 cnt += 25; 2285 break; 2286 default: 2287 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2288 } 2289 CHKMEMQ; 2290 } 2291 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2292 2293 2294 2295 /* 2296 Apply (L + D)^-1 where D is the block diagonal 2297 */ 2298 for (i=0, row=0; i<m; i++) { 2299 sz = diag[row] - ii[row]; 2300 v1 = a->a + ii[row]; 2301 idx = a->j + ii[row]; 2302 CHKMEMQ; 2303 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2304 switch (sizes[i]){ 2305 case 1: 2306 2307 sum1 = t[row]; 2308 for(n = 0; n<sz-1; n+=2) { 2309 i1 = idx[0]; 2310 i2 = idx[1]; 2311 idx += 2; 2312 tmp0 = t[i1]; 2313 tmp1 = t[i2]; 2314 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2315 } 2316 2317 if (n == sz-1){ 2318 tmp0 = t[*idx]; 2319 sum1 -= *v1 * tmp0; 2320 } 2321 x[row] += t[row] = sum1*(*ibdiag++); row++; 2322 break; 2323 case 2: 2324 v2 = a->a + ii[row+1]; 2325 sum1 = t[row]; 2326 sum2 = t[row+1]; 2327 for(n = 0; n<sz-1; n+=2) { 2328 i1 = idx[0]; 2329 i2 = idx[1]; 2330 idx += 2; 2331 tmp0 = t[i1]; 2332 tmp1 = t[i2]; 2333 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2334 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2335 } 2336 2337 if (n == sz-1){ 2338 tmp0 = t[*idx]; 2339 sum1 -= v1[0] * tmp0; 2340 sum2 -= v2[0] * tmp0; 2341 } 2342 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2343 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2344 ibdiag += 4; row += 2; 2345 break; 2346 case 3: 2347 v2 = a->a + ii[row+1]; 2348 v3 = a->a + ii[row+2]; 2349 sum1 = t[row]; 2350 sum2 = t[row+1]; 2351 sum3 = t[row+2]; 2352 for(n = 0; n<sz-1; n+=2) { 2353 i1 = idx[0]; 2354 i2 = idx[1]; 2355 idx += 2; 2356 tmp0 = t[i1]; 2357 tmp1 = t[i2]; 2358 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2359 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2360 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2361 } 2362 2363 if (n == sz-1){ 2364 tmp0 = t[*idx]; 2365 sum1 -= v1[0] * tmp0; 2366 sum2 -= v2[0] * tmp0; 2367 sum3 -= v3[0] * tmp0; 2368 } 2369 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2370 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2371 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2372 ibdiag += 9; row += 3; 2373 break; 2374 case 4: 2375 v2 = a->a + ii[row+1]; 2376 v3 = a->a + ii[row+2]; 2377 v4 = a->a + ii[row+3]; 2378 sum1 = t[row]; 2379 sum2 = t[row+1]; 2380 sum3 = t[row+2]; 2381 sum4 = t[row+3]; 2382 for(n = 0; n<sz-1; n+=2) { 2383 i1 = idx[0]; 2384 i2 = idx[1]; 2385 idx += 2; 2386 tmp0 = t[i1]; 2387 tmp1 = t[i2]; 2388 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2389 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2390 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2391 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2392 } 2393 2394 if (n == sz-1){ 2395 tmp0 = t[*idx]; 2396 sum1 -= v1[0] * tmp0; 2397 sum2 -= v2[0] * tmp0; 2398 sum3 -= v3[0] * tmp0; 2399 sum4 -= v4[0] * tmp0; 2400 } 2401 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2402 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2403 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2404 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2405 ibdiag += 16; row += 4; 2406 break; 2407 case 5: 2408 v2 = a->a + ii[row+1]; 2409 v3 = a->a + ii[row+2]; 2410 v4 = a->a + ii[row+3]; 2411 v5 = a->a + ii[row+4]; 2412 sum1 = t[row]; 2413 sum2 = t[row+1]; 2414 sum3 = t[row+2]; 2415 sum4 = t[row+3]; 2416 sum5 = t[row+4]; 2417 for(n = 0; n<sz-1; n+=2) { 2418 i1 = idx[0]; 2419 i2 = idx[1]; 2420 idx += 2; 2421 tmp0 = t[i1]; 2422 tmp1 = t[i2]; 2423 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2424 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2425 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2426 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2427 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2428 } 2429 2430 if (n == sz-1){ 2431 tmp0 = t[*idx]; 2432 sum1 -= v1[0] * tmp0; 2433 sum2 -= v2[0] * tmp0; 2434 sum3 -= v3[0] * tmp0; 2435 sum4 -= v4[0] * tmp0; 2436 sum5 -= v5[0] * tmp0; 2437 } 2438 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2439 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2440 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2441 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2442 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2443 ibdiag += 25; row += 5; 2444 break; 2445 default: 2446 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2447 } 2448 CHKMEMQ; 2449 } 2450 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2451 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2452 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2453 } 2454 PetscFunctionReturn(0); 2455 } 2456 2457 #undef __FUNCT__ 2458 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 2459 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2460 { 2461 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2462 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2463 const MatScalar *bdiag = a->inode.bdiag; 2464 const PetscScalar *b; 2465 PetscErrorCode ierr; 2466 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2467 const PetscInt *sizes = a->inode.size; 2468 2469 PetscFunctionBegin; 2470 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2471 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2472 cnt = 0; 2473 for (i=0, row=0; i<m; i++) { 2474 switch (sizes[i]){ 2475 case 1: 2476 x[row] = b[row]*bdiag[cnt++];row++; 2477 break; 2478 case 2: 2479 x1 = b[row]; x2 = b[row+1]; 2480 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2481 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2482 x[row++] = tmp1; 2483 x[row++] = tmp2; 2484 cnt += 4; 2485 break; 2486 case 3: 2487 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2488 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2489 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2490 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2491 x[row++] = tmp1; 2492 x[row++] = tmp2; 2493 x[row++] = tmp3; 2494 cnt += 9; 2495 break; 2496 case 4: 2497 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2498 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2499 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2500 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2501 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2502 x[row++] = tmp1; 2503 x[row++] = tmp2; 2504 x[row++] = tmp3; 2505 x[row++] = tmp4; 2506 cnt += 16; 2507 break; 2508 case 5: 2509 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2510 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2511 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2512 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2513 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2514 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2515 x[row++] = tmp1; 2516 x[row++] = tmp2; 2517 x[row++] = tmp3; 2518 x[row++] = tmp4; 2519 x[row++] = tmp5; 2520 cnt += 25; 2521 break; 2522 default: 2523 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2524 } 2525 } 2526 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2527 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2528 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2529 PetscFunctionReturn(0); 2530 } 2531 2532 /* 2533 samestructure indicates that the matrix has not changed its nonzero structure so we 2534 do not need to recompute the inodes 2535 */ 2536 #undef __FUNCT__ 2537 #define __FUNCT__ "Mat_CheckInode" 2538 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2539 { 2540 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2541 PetscErrorCode ierr; 2542 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2543 PetscTruth flag; 2544 2545 PetscFunctionBegin; 2546 if (!a->inode.use) PetscFunctionReturn(0); 2547 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2548 2549 2550 m = A->rmap->n; 2551 if (a->inode.size) {ns = a->inode.size;} 2552 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2553 2554 i = 0; 2555 node_count = 0; 2556 idx = a->j; 2557 ii = a->i; 2558 while (i < m){ /* For each row */ 2559 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2560 /* Limits the number of elements in a node to 'a->inode.limit' */ 2561 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2562 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2563 if (nzy != nzx) break; 2564 idy += nzx; /* Same nonzero pattern */ 2565 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2566 if (!flag) break; 2567 } 2568 ns[node_count++] = blk_size; 2569 idx += blk_size*nzx; 2570 i = j; 2571 } 2572 /* If not enough inodes found,, do not use inode version of the routines */ 2573 if (!a->inode.size && m && node_count > .9*m) { 2574 ierr = PetscFree(ns);CHKERRQ(ierr); 2575 a->inode.node_count = 0; 2576 a->inode.size = PETSC_NULL; 2577 a->inode.use = PETSC_FALSE; 2578 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2579 } else { 2580 A->ops->mult = MatMult_SeqAIJ_Inode; 2581 A->ops->sor = MatSOR_SeqAIJ_Inode; 2582 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 2583 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 2584 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 2585 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 2586 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 2587 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 2588 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 2589 a->inode.node_count = node_count; 2590 a->inode.size = ns; 2591 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2592 } 2593 PetscFunctionReturn(0); 2594 } 2595 2596 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 2597 PetscInt __k, *__vi; \ 2598 __vi = aj + ai[row]; \ 2599 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 2600 __vi = aj + adiag[row]; \ 2601 cols[nzl] = __vi[0];\ 2602 __vi = aj + adiag[row+1]+1;\ 2603 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 2604 2605 #undef __FUNCT__ 2606 #define __FUNCT__ "Mat_CheckInode_FactorLU" 2607 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 2608 { 2609 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2610 PetscErrorCode ierr; 2611 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 2612 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 2613 PetscTruth flag; 2614 2615 PetscFunctionBegin; 2616 if (!a->inode.use) PetscFunctionReturn(0); 2617 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2618 2619 m = A->rmap->n; 2620 if (a->inode.size) {ns = a->inode.size;} 2621 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2622 2623 i = 0; 2624 node_count = 0; 2625 while (i < m){ /* For each row */ 2626 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 2627 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 2628 nzx = nzl1 + nzu1 + 1; 2629 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 2630 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 2631 2632 /* Limits the number of elements in a node to 'a->inode.limit' */ 2633 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2634 nzl2 = ai[j+1] - ai[j]; 2635 nzu2 = adiag[j] - adiag[j+1] - 1; 2636 nzy = nzl2 + nzu2 + 1; 2637 if( nzy != nzx) break; 2638 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 2639 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 2640 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2641 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 2642 ierr = PetscFree(cols2);CHKERRQ(ierr); 2643 } 2644 ns[node_count++] = blk_size; 2645 ierr = PetscFree(cols1);CHKERRQ(ierr); 2646 i = j; 2647 } 2648 /* If not enough inodes found,, do not use inode version of the routines */ 2649 if (!a->inode.size && m && node_count > .9*m) { 2650 ierr = PetscFree(ns);CHKERRQ(ierr); 2651 a->inode.node_count = 0; 2652 a->inode.size = PETSC_NULL; 2653 a->inode.use = PETSC_FALSE; 2654 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2655 } else { 2656 A->ops->mult = 0; 2657 A->ops->sor = 0; 2658 A->ops->multadd = 0; 2659 A->ops->getrowij = 0; 2660 A->ops->restorerowij = 0; 2661 A->ops->getcolumnij = 0; 2662 A->ops->restorecolumnij = 0; 2663 A->ops->coloringpatch = 0; 2664 A->ops->multdiagonalblock = 0; 2665 a->inode.node_count = node_count; 2666 a->inode.size = ns; 2667 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2668 } 2669 PetscFunctionReturn(0); 2670 } 2671 2672 /* 2673 This is really ugly. if inodes are used this replaces the 2674 permutations with ones that correspond to rows/cols of the matrix 2675 rather then inode blocks 2676 */ 2677 #undef __FUNCT__ 2678 #define __FUNCT__ "MatInodeAdjustForInodes" 2679 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2680 { 2681 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2682 2683 PetscFunctionBegin; 2684 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2685 if (f) { 2686 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2687 } 2688 PetscFunctionReturn(0); 2689 } 2690 2691 EXTERN_C_BEGIN 2692 #undef __FUNCT__ 2693 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 2694 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 2695 { 2696 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2697 PetscErrorCode ierr; 2698 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2699 const PetscInt *ridx,*cidx; 2700 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2701 PetscInt nslim_col,*ns_col; 2702 IS ris = *rperm,cis = *cperm; 2703 2704 PetscFunctionBegin; 2705 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2706 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2707 2708 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2709 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2710 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 2711 2712 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2713 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2714 2715 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2716 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2717 2718 /* Construct the permutations for rows*/ 2719 for (i=0,row = 0; i<nslim_row; ++i){ 2720 indx = ridx[i]; 2721 start_val = tns[indx]; 2722 end_val = tns[indx + 1]; 2723 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2724 } 2725 2726 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2727 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2728 2729 /* Construct permutations for columns */ 2730 for (i=0,col=0; i<nslim_col; ++i){ 2731 indx = cidx[i]; 2732 start_val = tns[indx]; 2733 end_val = tns[indx + 1]; 2734 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2735 } 2736 2737 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2738 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 2739 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2740 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 2741 2742 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2743 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2744 2745 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2746 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 2747 ierr = ISDestroy(cis);CHKERRQ(ierr); 2748 ierr = ISDestroy(ris);CHKERRQ(ierr); 2749 ierr = PetscFree(tns);CHKERRQ(ierr); 2750 PetscFunctionReturn(0); 2751 } 2752 EXTERN_C_END 2753 2754 #undef __FUNCT__ 2755 #define __FUNCT__ "MatInodeGetInodeSizes" 2756 /*@C 2757 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2758 2759 Collective on Mat 2760 2761 Input Parameter: 2762 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2763 2764 Output Parameter: 2765 + node_count - no of inodes present in the matrix. 2766 . sizes - an array of size node_count,with sizes of each inode. 2767 - limit - the max size used to generate the inodes. 2768 2769 Level: advanced 2770 2771 Notes: This routine returns some internal storage information 2772 of the matrix, it is intended to be used by advanced users. 2773 It should be called after the matrix is assembled. 2774 The contents of the sizes[] array should not be changed. 2775 PETSC_NULL may be passed for information not requested. 2776 2777 .keywords: matrix, seqaij, get, inode 2778 2779 .seealso: MatGetInfo() 2780 @*/ 2781 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2782 { 2783 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2784 2785 PetscFunctionBegin; 2786 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2787 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2788 if (f) { 2789 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2790 } 2791 PetscFunctionReturn(0); 2792 } 2793 2794 EXTERN_C_BEGIN 2795 #undef __FUNCT__ 2796 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 2797 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2798 { 2799 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2800 2801 PetscFunctionBegin; 2802 if (node_count) *node_count = a->inode.node_count; 2803 if (sizes) *sizes = a->inode.size; 2804 if (limit) *limit = a->inode.limit; 2805 PetscFunctionReturn(0); 2806 } 2807 EXTERN_C_END 2808