1 2 #ifdef PETSC_RCS_HEADER 3 static char vcid[] = "$Id: baij.c,v 1.112 1997/09/26 02:19:29 bsmith Exp bsmith $"; 4 #endif 5 6 /* 7 Defines the basic matrix operations for the BAIJ (compressed row) 8 matrix storage format. 9 */ 10 11 #include "pinclude/pviewer.h" 12 #include "sys.h" 13 #include "src/mat/impls/baij/seq/baij.h" 14 #include "src/vec/vecimpl.h" 15 #include "src/inline/spops.h" 16 #include "petsc.h" 17 18 19 /* 20 Adds diagonal pointers to sparse matrix structure. 21 */ 22 23 #undef __FUNC__ 24 #define __FUNC__ "MatMarkDiag_SeqBAIJ" 25 int MatMarkDiag_SeqBAIJ(Mat A) 26 { 27 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 28 int i,j, *diag, m = a->mbs; 29 30 diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag); 31 PLogObjectMemory(A,(m+1)*sizeof(int)); 32 for ( i=0; i<m; i++ ) { 33 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 34 if (a->j[j] == i) { 35 diag[i] = j; 36 break; 37 } 38 } 39 } 40 a->diag = diag; 41 return 0; 42 } 43 44 45 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**); 46 47 #undef __FUNC__ 48 #define __FUNC__ "MatGetRowIJ_SeqBAIJ" 49 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja, 50 PetscTruth *done) 51 { 52 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 53 int ierr, n = a->mbs,i; 54 55 *nn = n; 56 if (!ia) return 0; 57 if (symmetric) { 58 ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr); 59 } else if (oshift == 1) { 60 /* temporarily add 1 to i and j indices */ 61 int nz = a->i[n] + 1; 62 for ( i=0; i<nz; i++ ) a->j[i]++; 63 for ( i=0; i<n+1; i++ ) a->i[i]++; 64 *ia = a->i; *ja = a->j; 65 } else { 66 *ia = a->i; *ja = a->j; 67 } 68 69 return 0; 70 } 71 72 #undef __FUNC__ 73 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ" 74 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja, 75 PetscTruth *done) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 78 int i,n = a->mbs; 79 80 if (!ia) return 0; 81 if (symmetric) { 82 PetscFree(*ia); 83 PetscFree(*ja); 84 } else if (oshift == 1) { 85 int nz = a->i[n]; 86 for ( i=0; i<nz; i++ ) a->j[i]--; 87 for ( i=0; i<n+1; i++ ) a->i[i]--; 88 } 89 return 0; 90 } 91 92 93 #undef __FUNC__ 94 #define __FUNC__ "MatView_SeqBAIJ_Binary" 95 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer) 96 { 97 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 98 int i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2; 99 Scalar *aa; 100 101 ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr); 102 col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens); 103 col_lens[0] = MAT_COOKIE; 104 105 col_lens[1] = a->m; 106 col_lens[2] = a->n; 107 col_lens[3] = a->nz*bs2; 108 109 /* store lengths of each row and write (including header) to file */ 110 count = 0; 111 for ( i=0; i<a->mbs; i++ ) { 112 for ( j=0; j<bs; j++ ) { 113 col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]); 114 } 115 } 116 ierr = PetscBinaryWrite(fd,col_lens,4+a->m,PETSC_INT,1); CHKERRQ(ierr); 117 PetscFree(col_lens); 118 119 /* store column indices (zero start index) */ 120 jj = (int *) PetscMalloc( (a->nz+1)*bs2*sizeof(int) ); CHKPTRQ(jj); 121 count = 0; 122 for ( i=0; i<a->mbs; i++ ) { 123 for ( j=0; j<bs; j++ ) { 124 for ( k=a->i[i]; k<a->i[i+1]; k++ ) { 125 for ( l=0; l<bs; l++ ) { 126 jj[count++] = bs*a->j[k] + l; 127 } 128 } 129 } 130 } 131 ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,PETSC_INT,0); CHKERRQ(ierr); 132 PetscFree(jj); 133 134 /* store nonzero values */ 135 aa = (Scalar *) PetscMalloc((a->nz+1)*bs2*sizeof(Scalar)); CHKPTRQ(aa); 136 count = 0; 137 for ( i=0; i<a->mbs; i++ ) { 138 for ( j=0; j<bs; j++ ) { 139 for ( k=a->i[i]; k<a->i[i+1]; k++ ) { 140 for ( l=0; l<bs; l++ ) { 141 aa[count++] = a->a[bs2*k + l*bs + j]; 142 } 143 } 144 } 145 } 146 ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,PETSC_SCALAR,0); CHKERRQ(ierr); 147 PetscFree(aa); 148 return 0; 149 } 150 151 #undef __FUNC__ 152 #define __FUNC__ "MatView_SeqBAIJ_ASCII" 153 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer) 154 { 155 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 156 int ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2; 157 FILE *fd; 158 char *outputname; 159 160 ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr); 161 ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr); 162 ierr = ViewerGetFormat(viewer,&format); 163 if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) { 164 fprintf(fd," block size is %d\n",bs); 165 } 166 else if (format == VIEWER_FORMAT_ASCII_MATLAB) { 167 SETERRQ(1,0,"Matlab format not supported"); 168 } 169 else if (format == VIEWER_FORMAT_ASCII_COMMON) { 170 for ( i=0; i<a->mbs; i++ ) { 171 for ( j=0; j<bs; j++ ) { 172 fprintf(fd,"row %d:",i*bs+j); 173 for ( k=a->i[i]; k<a->i[i+1]; k++ ) { 174 for ( l=0; l<bs; l++ ) { 175 #if defined(PETSC_COMPLEX) 176 if (imag(a->a[bs2*k + l*bs + j]) > 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0) 177 fprintf(fd," %d %g + %g i",bs*a->j[k]+l, 178 real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j])); 179 else if (imag(a->a[bs2*k + l*bs + j]) < 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0) 180 fprintf(fd," %d %g - %g i",bs*a->j[k]+l, 181 real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j])); 182 else if (real(a->a[bs2*k + l*bs + j]) != 0.0) 183 fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j])); 184 #else 185 if (a->a[bs2*k + l*bs + j] != 0.0) 186 fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]); 187 #endif 188 } 189 } 190 fprintf(fd,"\n"); 191 } 192 } 193 } 194 else { 195 for ( i=0; i<a->mbs; i++ ) { 196 for ( j=0; j<bs; j++ ) { 197 fprintf(fd,"row %d:",i*bs+j); 198 for ( k=a->i[i]; k<a->i[i+1]; k++ ) { 199 for ( l=0; l<bs; l++ ) { 200 #if defined(PETSC_COMPLEX) 201 if (imag(a->a[bs2*k + l*bs + j]) > 0.0) { 202 fprintf(fd," %d %g + %g i",bs*a->j[k]+l, 203 real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j])); 204 } 205 else if (imag(a->a[bs2*k + l*bs + j]) < 0.0) { 206 fprintf(fd," %d %g - %g i",bs*a->j[k]+l, 207 real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j])); 208 } 209 else { 210 fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j])); 211 } 212 #else 213 fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]); 214 #endif 215 } 216 } 217 fprintf(fd,"\n"); 218 } 219 } 220 } 221 fflush(fd); 222 return 0; 223 } 224 225 #undef __FUNC__ 226 #define __FUNC__ "MatView_SeqBAIJ_Draw" 227 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer) 228 { 229 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *) A->data; 230 int row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2; 231 double xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r; 232 Scalar *aa; 233 Draw draw; 234 DrawButton button; 235 PetscTruth isnull; 236 237 ViewerDrawGetDraw(viewer,&draw); 238 ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0; 239 240 xr = a->n; yr = a->m; h = yr/10.0; w = xr/10.0; 241 xr += w; yr += h; xl = -w; yl = -h; 242 ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr); 243 /* loop over matrix elements drawing boxes */ 244 color = DRAW_BLUE; 245 for ( i=0,row=0; i<mbs; i++,row+=bs ) { 246 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 247 y_l = a->m - row - 1.0; y_r = y_l + 1.0; 248 x_l = a->j[j]*bs; x_r = x_l + 1.0; 249 aa = a->a + j*bs2; 250 for ( k=0; k<bs; k++ ) { 251 for ( l=0; l<bs; l++ ) { 252 if (PetscReal(*aa++) >= 0.) continue; 253 DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color); 254 } 255 } 256 } 257 } 258 color = DRAW_CYAN; 259 for ( i=0,row=0; i<mbs; i++,row+=bs ) { 260 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 261 y_l = a->m - row - 1.0; y_r = y_l + 1.0; 262 x_l = a->j[j]*bs; x_r = x_l + 1.0; 263 aa = a->a + j*bs2; 264 for ( k=0; k<bs; k++ ) { 265 for ( l=0; l<bs; l++ ) { 266 if (PetscReal(*aa++) != 0.) continue; 267 DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color); 268 } 269 } 270 } 271 } 272 273 color = DRAW_RED; 274 for ( i=0,row=0; i<mbs; i++,row+=bs ) { 275 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 276 y_l = a->m - row - 1.0; y_r = y_l + 1.0; 277 x_l = a->j[j]*bs; x_r = x_l + 1.0; 278 aa = a->a + j*bs2; 279 for ( k=0; k<bs; k++ ) { 280 for ( l=0; l<bs; l++ ) { 281 if (PetscReal(*aa++) <= 0.) continue; 282 DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color); 283 } 284 } 285 } 286 } 287 288 DrawFlush(draw); 289 DrawGetPause(draw,&pause); 290 if (pause >= 0) { PetscSleep(pause); return 0;} 291 292 /* allow the matrix to zoom or shrink */ 293 ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0); 294 while (button != BUTTON_RIGHT) { 295 DrawClear(draw); 296 if (button == BUTTON_LEFT) scale = .5; 297 else if (button == BUTTON_CENTER) scale = 2.; 298 xl = scale*(xl + w - xc) + xc - w*scale; 299 xr = scale*(xr - w - xc) + xc + w*scale; 300 yl = scale*(yl + h - yc) + yc - h*scale; 301 yr = scale*(yr - h - yc) + yc + h*scale; 302 w *= scale; h *= scale; 303 ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr); 304 color = DRAW_BLUE; 305 for ( i=0,row=0; i<mbs; i++,row+=bs ) { 306 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 307 y_l = a->m - row - 1.0; y_r = y_l + 1.0; 308 x_l = a->j[j]*bs; x_r = x_l + 1.0; 309 aa = a->a + j*bs2; 310 for ( k=0; k<bs; k++ ) { 311 for ( l=0; l<bs; l++ ) { 312 if (PetscReal(*aa++) >= 0.) continue; 313 DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color); 314 } 315 } 316 } 317 } 318 color = DRAW_CYAN; 319 for ( i=0,row=0; i<mbs; i++,row+=bs ) { 320 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 321 y_l = a->m - row - 1.0; y_r = y_l + 1.0; 322 x_l = a->j[j]*bs; x_r = x_l + 1.0; 323 aa = a->a + j*bs2; 324 for ( k=0; k<bs; k++ ) { 325 for ( l=0; l<bs; l++ ) { 326 if (PetscReal(*aa++) != 0.) continue; 327 DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color); 328 } 329 } 330 } 331 } 332 333 color = DRAW_RED; 334 for ( i=0,row=0; i<mbs; i++,row+=bs ) { 335 for ( j=a->i[i]; j<a->i[i+1]; j++ ) { 336 y_l = a->m - row - 1.0; y_r = y_l + 1.0; 337 x_l = a->j[j]*bs; x_r = x_l + 1.0; 338 aa = a->a + j*bs2; 339 for ( k=0; k<bs; k++ ) { 340 for ( l=0; l<bs; l++ ) { 341 if (PetscReal(*aa++) <= 0.) continue; 342 DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color); 343 } 344 } 345 } 346 } 347 ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0); 348 } 349 return 0; 350 } 351 352 #undef __FUNC__ 353 #define __FUNC__ "MatView_SeqBAIJ" 354 int MatView_SeqBAIJ(PetscObject obj,Viewer viewer) 355 { 356 Mat A = (Mat) obj; 357 ViewerType vtype; 358 int ierr; 359 360 ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr); 361 if (vtype == MATLAB_VIEWER) { 362 SETERRQ(1,0,"Matlab viewer not supported"); 363 } 364 else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){ 365 return MatView_SeqBAIJ_ASCII(A,viewer); 366 } 367 else if (vtype == BINARY_FILE_VIEWER) { 368 return MatView_SeqBAIJ_Binary(A,viewer); 369 } 370 else if (vtype == DRAW_VIEWER) { 371 return MatView_SeqBAIJ_Draw(A,viewer); 372 } 373 return 0; 374 } 375 376 #define CHUNKSIZE 10 377 378 #undef __FUNC__ 379 #define __FUNC__ "MatSetValues_SeqBAIJ" 380 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is) 381 { 382 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 383 int *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted; 384 int *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented; 385 int *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol; 386 int ridx,cidx,bs2=a->bs2; 387 Scalar *ap,value,*aa=a->a,*bap; 388 389 for ( k=0; k<m; k++ ) { /* loop over added rows */ 390 row = im[k]; brow = row/bs; 391 #if defined(PETSC_BOPT_g) 392 if (row < 0) SETERRQ(1,0,"Negative row"); 393 if (row >= a->m) SETERRQ(1,0,"Row too large"); 394 #endif 395 rp = aj + ai[brow]; 396 ap = aa + bs2*ai[brow]; 397 rmax = imax[brow]; 398 nrow = ailen[brow]; 399 low = 0; 400 for ( l=0; l<n; l++ ) { /* loop over added columns */ 401 #if defined(PETSC_BOPT_g) 402 if (in[l] < 0) SETERRQ(1,0,"Negative column"); 403 if (in[l] >= a->n) SETERRQ(1,0,"Column too large"); 404 #endif 405 col = in[l]; bcol = col/bs; 406 ridx = row % bs; cidx = col % bs; 407 if (roworiented) { 408 value = *v++; 409 } 410 else { 411 value = v[k + l*m]; 412 } 413 if (!sorted) low = 0; high = nrow; 414 while (high-low > 7) { 415 t = (low+high)/2; 416 if (rp[t] > bcol) high = t; 417 else low = t; 418 } 419 for ( i=low; i<high; i++ ) { 420 if (rp[i] > bcol) break; 421 if (rp[i] == bcol) { 422 bap = ap + bs2*i + bs*cidx + ridx; 423 if (is == ADD_VALUES) *bap += value; 424 else *bap = value; 425 goto noinsert1; 426 } 427 } 428 if (nonew == 1) goto noinsert1; 429 else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); 430 if (nrow >= rmax) { 431 /* there is no extra room in row, therefore enlarge */ 432 int new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; 433 Scalar *new_a; 434 435 if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); 436 437 /* Malloc new storage space */ 438 len = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); 439 new_a = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); 440 new_j = (int *) (new_a + bs2*new_nz); 441 new_i = new_j + new_nz; 442 443 /* copy over old data into new slots */ 444 for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} 445 for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} 446 PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); 447 len = (new_nz - CHUNKSIZE - ai[brow] - nrow); 448 PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, 449 len*sizeof(int)); 450 PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); 451 PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); 452 PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), 453 aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar)); 454 /* free up old matrix storage */ 455 PetscFree(a->a); 456 if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} 457 aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j; 458 a->singlemalloc = 1; 459 460 rp = aj + ai[brow]; ap = aa + bs2*ai[brow]; 461 rmax = imax[brow] = imax[brow] + CHUNKSIZE; 462 PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); 463 a->maxnz += bs2*CHUNKSIZE; 464 a->reallocs++; 465 a->nz++; 466 } 467 N = nrow++ - 1; 468 /* shift up all the later entries in this row */ 469 for ( ii=N; ii>=i; ii-- ) { 470 rp[ii+1] = rp[ii]; 471 PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); 472 } 473 if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar)); 474 rp[i] = bcol; 475 ap[bs2*i + bs*cidx + ridx] = value; 476 noinsert1:; 477 low = i; 478 } 479 ailen[brow] = nrow; 480 } 481 return 0; 482 } 483 484 #undef __FUNC__ 485 #define __FUNC__ "MatSetValuesBlocked_SeqBAIJ" 486 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is) 487 { 488 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 489 int *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted; 490 int *imax=a->imax,*ai=a->i,*ailen=a->ilen, roworiented=a->roworiented; 491 int *aj=a->j,nonew=a->nonew,bs2=a->bs2,bs=a->bs,stepval; 492 Scalar *ap,*value=v,*aa=a->a,*bap; 493 494 if (roworiented) { 495 stepval = (n-1)*bs; 496 } else { 497 stepval = (m-1)*bs; 498 } 499 for ( k=0; k<m; k++ ) { /* loop over added rows */ 500 row = im[k]; 501 #if defined(PETSC_BOPT_g) 502 if (row < 0) SETERRQ(1,0,"Negative row"); 503 if (row >= a->mbs) SETERRQ(1,0,"Row too large"); 504 #endif 505 rp = aj + ai[row]; 506 ap = aa + bs2*ai[row]; 507 rmax = imax[row]; 508 nrow = ailen[row]; 509 low = 0; 510 for ( l=0; l<n; l++ ) { /* loop over added columns */ 511 #if defined(PETSC_BOPT_g) 512 if (in[l] < 0) SETERRQ(1,0,"Negative column"); 513 if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large"); 514 #endif 515 col = in[l]; 516 if (roworiented) { 517 value = v + k*(stepval+bs)*bs + l*bs; 518 } else { 519 value = v + l*(stepval+bs)*bs + k*bs; 520 } 521 if (!sorted) low = 0; high = nrow; 522 while (high-low > 7) { 523 t = (low+high)/2; 524 if (rp[t] > col) high = t; 525 else low = t; 526 } 527 for ( i=low; i<high; i++ ) { 528 if (rp[i] > col) break; 529 if (rp[i] == col) { 530 bap = ap + bs2*i; 531 if (roworiented) { 532 if (is == ADD_VALUES) { 533 for ( ii=0; ii<bs; ii++,value+=stepval ) { 534 for (jj=ii; jj<bs2; jj+=bs ) { 535 bap[jj] += *value++; 536 } 537 } 538 } else { 539 for ( ii=0; ii<bs; ii++,value+=stepval ) { 540 for (jj=ii; jj<bs2; jj+=bs ) { 541 bap[jj] = *value++; 542 } 543 } 544 } 545 } else { 546 if (is == ADD_VALUES) { 547 for ( ii=0; ii<bs; ii++,value+=stepval ) { 548 for (jj=0; jj<bs; jj++ ) { 549 *bap++ += *value++; 550 } 551 } 552 } else { 553 for ( ii=0; ii<bs; ii++,value+=stepval ) { 554 for (jj=0; jj<bs; jj++ ) { 555 *bap++ = *value++; 556 } 557 } 558 } 559 } 560 goto noinsert2; 561 } 562 } 563 if (nonew == 1) goto noinsert2; 564 else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); 565 if (nrow >= rmax) { 566 /* there is no extra room in row, therefore enlarge */ 567 int new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; 568 Scalar *new_a; 569 570 if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); 571 572 /* malloc new storage space */ 573 len = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); 574 new_a = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); 575 new_j = (int *) (new_a + bs2*new_nz); 576 new_i = new_j + new_nz; 577 578 /* copy over old data into new slots */ 579 for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];} 580 for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} 581 PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int)); 582 len = (new_nz - CHUNKSIZE - ai[row] - nrow); 583 PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,len*sizeof(int)); 584 PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar)); 585 PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); 586 PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar)); 587 /* free up old matrix storage */ 588 PetscFree(a->a); 589 if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} 590 aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j; 591 a->singlemalloc = 1; 592 593 rp = aj + ai[row]; ap = aa + bs2*ai[row]; 594 rmax = imax[row] = imax[row] + CHUNKSIZE; 595 PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); 596 a->maxnz += bs2*CHUNKSIZE; 597 a->reallocs++; 598 a->nz++; 599 } 600 N = nrow++ - 1; 601 /* shift up all the later entries in this row */ 602 for ( ii=N; ii>=i; ii-- ) { 603 rp[ii+1] = rp[ii]; 604 PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); 605 } 606 if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar)); 607 rp[i] = col; 608 bap = ap + bs2*i; 609 if (roworiented) { 610 for ( ii=0; ii<bs; ii++,value+=stepval) { 611 for (jj=ii; jj<bs2; jj+=bs ) { 612 bap[jj] = *value++; 613 } 614 } 615 } else { 616 for ( ii=0; ii<bs; ii++,value+=stepval ) { 617 for (jj=0; jj<bs; jj++ ) { 618 *bap++ = *value++; 619 } 620 } 621 } 622 noinsert2:; 623 low = i; 624 } 625 ailen[row] = nrow; 626 } 627 return 0; 628 } 629 630 #undef __FUNC__ 631 #define __FUNC__ "MatGetSize_SeqBAIJ" 632 int MatGetSize_SeqBAIJ(Mat A,int *m,int *n) 633 { 634 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 635 if (m) *m = a->m; 636 if (n) *n = a->n; 637 return 0; 638 } 639 640 #undef __FUNC__ 641 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ" 642 int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n) 643 { 644 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 645 *m = 0; *n = a->m; 646 return 0; 647 } 648 649 #undef __FUNC__ 650 #define __FUNC__ "MatGetRow_SeqBAIJ" 651 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v) 652 { 653 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 654 int itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2; 655 Scalar *aa,*v_i,*aa_i; 656 657 bs = a->bs; 658 ai = a->i; 659 aj = a->j; 660 aa = a->a; 661 bs2 = a->bs2; 662 663 if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range"); 664 665 bn = row/bs; /* Block number */ 666 bp = row % bs; /* Block Position */ 667 M = ai[bn+1] - ai[bn]; 668 *nz = bs*M; 669 670 if (v) { 671 *v = 0; 672 if (*nz) { 673 *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v); 674 for ( i=0; i<M; i++ ) { /* for each block in the block row */ 675 v_i = *v + i*bs; 676 aa_i = aa + bs2*(ai[bn] + i); 677 for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];} 678 } 679 } 680 } 681 682 if (idx) { 683 *idx = 0; 684 if (*nz) { 685 *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx); 686 for ( i=0; i<M; i++ ) { /* for each block in the block row */ 687 idx_i = *idx + i*bs; 688 itmp = bs*aj[ai[bn] + i]; 689 for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;} 690 } 691 } 692 } 693 return 0; 694 } 695 696 #undef __FUNC__ 697 #define __FUNC__ "MatRestoreRow_SeqBAIJ" 698 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v) 699 { 700 if (idx) {if (*idx) PetscFree(*idx);} 701 if (v) {if (*v) PetscFree(*v);} 702 return 0; 703 } 704 705 #undef __FUNC__ 706 #define __FUNC__ "MatTranspose_SeqBAIJ" 707 int MatTranspose_SeqBAIJ(Mat A,Mat *B) 708 { 709 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 710 Mat C; 711 int i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col; 712 int *rows,*cols,bs2=a->bs2; 713 Scalar *array=a->a; 714 715 if (B==PETSC_NULL && mbs!=nbs) 716 SETERRQ(1,0,"Square matrix only for in-place"); 717 col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col); 718 PetscMemzero(col,(1+nbs)*sizeof(int)); 719 720 for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1; 721 ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr); 722 PetscFree(col); 723 rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows); 724 cols = rows + bs; 725 for ( i=0; i<mbs; i++ ) { 726 cols[0] = i*bs; 727 for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1; 728 len = ai[i+1] - ai[i]; 729 for ( j=0; j<len; j++ ) { 730 rows[0] = (*aj++)*bs; 731 for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1; 732 ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr); 733 array += bs2; 734 } 735 } 736 PetscFree(rows); 737 738 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 739 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 740 741 if (B != PETSC_NULL) { 742 *B = C; 743 } else { 744 /* This isn't really an in-place transpose */ 745 PetscFree(a->a); 746 if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);} 747 if (a->diag) PetscFree(a->diag); 748 if (a->ilen) PetscFree(a->ilen); 749 if (a->imax) PetscFree(a->imax); 750 if (a->solve_work) PetscFree(a->solve_work); 751 PetscFree(a); 752 PetscMemcpy(A,C,sizeof(struct _p_Mat)); 753 PetscHeaderDestroy(C); 754 } 755 return 0; 756 } 757 758 759 #undef __FUNC__ 760 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ" 761 int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode) 762 { 763 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 764 int fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax; 765 int m = a->m,*ip, N, *ailen = a->ilen; 766 int mbs = a->mbs, bs2 = a->bs2,rmax = 0; 767 Scalar *aa = a->a, *ap; 768 769 if (mode == MAT_FLUSH_ASSEMBLY) return 0; 770 771 if (m) rmax = ailen[0]; 772 for ( i=1; i<mbs; i++ ) { 773 /* move each row back by the amount of empty slots (fshift) before it*/ 774 fshift += imax[i-1] - ailen[i-1]; 775 rmax = PetscMax(rmax,ailen[i]); 776 if (fshift) { 777 ip = aj + ai[i]; ap = aa + bs2*ai[i]; 778 N = ailen[i]; 779 for ( j=0; j<N; j++ ) { 780 ip[j-fshift] = ip[j]; 781 PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar)); 782 } 783 } 784 ai[i] = ai[i-1] + ailen[i-1]; 785 } 786 if (mbs) { 787 fshift += imax[mbs-1] - ailen[mbs-1]; 788 ai[mbs] = ai[mbs-1] + ailen[mbs-1]; 789 } 790 /* reset ilen and imax for each row */ 791 for ( i=0; i<mbs; i++ ) { 792 ailen[i] = imax[i] = ai[i+1] - ai[i]; 793 } 794 a->nz = ai[mbs]; 795 796 /* diagonals may have moved, so kill the diagonal pointers */ 797 if (fshift && a->diag) { 798 PetscFree(a->diag); 799 PLogObjectMemory(A,-(m+1)*sizeof(int)); 800 a->diag = 0; 801 } 802 PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n", 803 m,a->n,a->bs,fshift*bs2,a->nz*bs2); 804 PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n", 805 a->reallocs); 806 PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax); 807 a->reallocs = 0; 808 A->info.nz_unneeded = (double)fshift*bs2; 809 810 return 0; 811 } 812 813 #undef __FUNC__ 814 #define __FUNC__ "MatZeroEntries_SeqBAIJ" 815 int MatZeroEntries_SeqBAIJ(Mat A) 816 { 817 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 818 PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar)); 819 return 0; 820 } 821 822 #undef __FUNC__ 823 #define __FUNC__ "MatDestroy_SeqBAIJ" 824 int MatDestroy_SeqBAIJ(PetscObject obj) 825 { 826 Mat A = (Mat) obj; 827 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 828 829 #if defined(PETSC_LOG) 830 PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz); 831 #endif 832 PetscFree(a->a); 833 if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);} 834 if (a->diag) PetscFree(a->diag); 835 if (a->ilen) PetscFree(a->ilen); 836 if (a->imax) PetscFree(a->imax); 837 if (a->solve_work) PetscFree(a->solve_work); 838 if (a->mult_work) PetscFree(a->mult_work); 839 PetscFree(a); 840 PLogObjectDestroy(A); 841 PetscHeaderDestroy(A); 842 return 0; 843 } 844 845 #undef __FUNC__ 846 #define __FUNC__ "MatSetOption_SeqBAIJ" 847 int MatSetOption_SeqBAIJ(Mat A,MatOption op) 848 { 849 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 850 if (op == MAT_ROW_ORIENTED) a->roworiented = 1; 851 else if (op == MAT_COLUMN_ORIENTED) a->roworiented = 0; 852 else if (op == MAT_COLUMNS_SORTED) a->sorted = 1; 853 else if (op == MAT_COLUMNS_UNSORTED) a->sorted = 0; 854 else if (op == MAT_NO_NEW_NONZERO_LOCATIONS) a->nonew = 1; 855 else if (op == MAT_NEW_NONZERO_LOCATION_ERROR) a->nonew = -1; 856 else if (op == MAT_NEW_NONZERO_ALLOCATION_ERROR) a->nonew = -2; 857 else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew = 0; 858 else if (op == MAT_ROWS_SORTED || 859 op == MAT_ROWS_UNSORTED || 860 op == MAT_SYMMETRIC || 861 op == MAT_STRUCTURALLY_SYMMETRIC || 862 op == MAT_YES_NEW_DIAGONALS || 863 op == MAT_IGNORE_OFF_PROC_ENTRIES) 864 PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n"); 865 else if (op == MAT_NO_NEW_DIAGONALS) 866 {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");} 867 else 868 {SETERRQ(PETSC_ERR_SUP,0,"unknown option");} 869 return 0; 870 } 871 872 873 /* -------------------------------------------------------*/ 874 /* Should check that shapes of vectors and matrices match */ 875 /* -------------------------------------------------------*/ 876 #include "pinclude/plapack.h" 877 878 #undef __FUNC__ 879 #define __FUNC__ "MatMult_SeqBAIJ_1" 880 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 881 { 882 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 883 register Scalar *x,*z,*v,sum; 884 int mbs=a->mbs,i,*idx,*ii,n; 885 886 VecGetArray_Fast(xx,x); 887 VecGetArray_Fast(zz,z); 888 889 idx = a->j; 890 v = a->a; 891 ii = a->i; 892 893 for ( i=0; i<mbs; i++ ) { 894 n = ii[1] - ii[0]; ii++; 895 sum = 0.0; 896 while (n--) sum += *v++ * x[*idx++]; 897 z[i] = sum; 898 } 899 VecRestoreArray_Fast(xx,x); 900 VecRestoreArray_Fast(zz,z); 901 PLogFlops(2*a->nz - a->m); 902 return 0; 903 } 904 905 #undef __FUNC__ 906 #define __FUNC__ "MatMult_SeqBAIJ_2" 907 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 908 { 909 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 910 register Scalar *x,*z,*v,*xb,sum1,sum2; 911 register Scalar x1,x2; 912 int mbs=a->mbs,i,*idx,*ii; 913 int j,n; 914 915 VecGetArray_Fast(xx,x); 916 VecGetArray_Fast(zz,z); 917 918 idx = a->j; 919 v = a->a; 920 ii = a->i; 921 922 for ( i=0; i<mbs; i++ ) { 923 n = ii[1] - ii[0]; ii++; 924 sum1 = 0.0; sum2 = 0.0; 925 for ( j=0; j<n; j++ ) { 926 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 927 sum1 += v[0]*x1 + v[2]*x2; 928 sum2 += v[1]*x1 + v[3]*x2; 929 v += 4; 930 } 931 z[0] = sum1; z[1] = sum2; 932 z += 2; 933 } 934 VecRestoreArray_Fast(xx,x); 935 VecRestoreArray_Fast(zz,z); 936 PLogFlops(4*a->nz - a->m); 937 return 0; 938 } 939 940 #undef __FUNC__ 941 #define __FUNC__ "MatMult_SeqBAIJ_3" 942 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 943 { 944 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 945 register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3; 946 int mbs=a->mbs,i,*idx,*ii,j,n; 947 948 VecGetArray_Fast(xx,x); 949 VecGetArray_Fast(zz,z); 950 951 idx = a->j; 952 v = a->a; 953 ii = a->i; 954 955 for ( i=0; i<mbs; i++ ) { 956 n = ii[1] - ii[0]; ii++; 957 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 958 for ( j=0; j<n; j++ ) { 959 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 960 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 961 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 962 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 963 v += 9; 964 } 965 z[0] = sum1; z[1] = sum2; z[2] = sum3; 966 z += 3; 967 } 968 VecRestoreArray_Fast(xx,x); 969 VecRestoreArray_Fast(zz,z); 970 PLogFlops(18*a->nz - a->m); 971 return 0; 972 } 973 974 #undef __FUNC__ 975 #define __FUNC__ "MatMult_SeqBAIJ_4" 976 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 977 { 978 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 979 register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4; 980 register Scalar x1,x2,x3,x4; 981 int mbs=a->mbs,i,*idx,*ii; 982 int j,n; 983 984 VecGetArray_Fast(xx,x); 985 VecGetArray_Fast(zz,z); 986 987 idx = a->j; 988 v = a->a; 989 ii = a->i; 990 991 for ( i=0; i<mbs; i++ ) { 992 n = ii[1] - ii[0]; ii++; 993 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 994 for ( j=0; j<n; j++ ) { 995 xb = x + 4*(*idx++); 996 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 997 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 998 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 999 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1000 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1001 v += 16; 1002 } 1003 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1004 z += 4; 1005 } 1006 VecRestoreArray_Fast(xx,x); 1007 VecRestoreArray_Fast(zz,z); 1008 PLogFlops(32*a->nz - a->m); 1009 return 0; 1010 } 1011 1012 #undef __FUNC__ 1013 #define __FUNC__ "MatMult_SeqBAIJ_5" 1014 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 1015 { 1016 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1017 register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5; 1018 register Scalar x1,x2,x3,x4,x5; 1019 int mbs=a->mbs,i,*idx,*ii,j,n; 1020 1021 VecGetArray_Fast(xx,x); 1022 VecGetArray_Fast(zz,z); 1023 1024 idx = a->j; 1025 v = a->a; 1026 ii = a->i; 1027 1028 for ( i=0; i<mbs; i++ ) { 1029 n = ii[1] - ii[0]; ii++; 1030 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 1031 for ( j=0; j<n; j++ ) { 1032 xb = x + 5*(*idx++); 1033 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1034 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1035 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1036 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1037 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1038 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1039 v += 25; 1040 } 1041 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1042 z += 5; 1043 } 1044 VecRestoreArray_Fast(xx,x); 1045 VecRestoreArray_Fast(zz,z); 1046 PLogFlops(50*a->nz - a->m); 1047 return 0; 1048 } 1049 1050 #undef __FUNC__ 1051 #define __FUNC__ "MatMult_SeqBAIJ_7" 1052 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1053 { 1054 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1055 register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1056 register Scalar x1,x2,x3,x4,x5,x6,x7; 1057 int mbs=a->mbs,i,*idx,*ii,j,n; 1058 1059 VecGetArray_Fast(xx,x); 1060 VecGetArray_Fast(zz,z); 1061 1062 idx = a->j; 1063 v = a->a; 1064 ii = a->i; 1065 1066 for ( i=0; i<mbs; i++ ) { 1067 n = ii[1] - ii[0]; ii++; 1068 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1069 for ( j=0; j<n; j++ ) { 1070 xb = x + 7*(*idx++); 1071 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1072 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1073 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1074 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1075 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1076 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1077 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1078 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1079 v += 49; 1080 } 1081 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1082 z += 7; 1083 } 1084 1085 VecRestoreArray_Fast(xx,x); 1086 VecRestoreArray_Fast(zz,z); 1087 PLogFlops(98*a->nz - a->m); 1088 return 0; 1089 } 1090 1091 #undef __FUNC__ 1092 #define __FUNC__ "MatMult_SeqBAIJ_N" 1093 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1094 { 1095 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1096 register Scalar *x,*z,*v,*xb; 1097 int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2; 1098 int _One = 1,ncols,k; 1099 Scalar _DOne = 1.0,*work,*workt,_DZero = 0.0; 1100 1101 VecGetArray_Fast(xx,x); 1102 VecGetArray_Fast(zz,z); 1103 1104 idx = a->j; 1105 v = a->a; 1106 ii = a->i; 1107 1108 1109 if (!a->mult_work) { 1110 k = PetscMax(a->m,a->n); 1111 a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 1112 } 1113 work = a->mult_work; 1114 for ( i=0; i<mbs; i++ ) { 1115 n = ii[1] - ii[0]; ii++; 1116 ncols = n*bs; 1117 workt = work; 1118 for ( j=0; j<n; j++ ) { 1119 xb = x + bs*(*idx++); 1120 for ( k=0; k<bs; k++ ) workt[k] = xb[k]; 1121 workt += bs; 1122 } 1123 LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); 1124 v += n*bs2; 1125 z += bs; 1126 } 1127 VecRestoreArray_Fast(xx,x); 1128 VecRestoreArray_Fast(zz,z); 1129 PLogFlops(2*a->nz*bs2 - a->m); 1130 return 0; 1131 } 1132 1133 #undef __FUNC__ 1134 #define __FUNC__ "MatMultAdd_SeqBAIJ_1" 1135 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1136 { 1137 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1138 register Scalar *x,*y,*z,*v,sum; 1139 int mbs=a->mbs,i,*idx,*ii,n; 1140 1141 VecGetArray_Fast(xx,x); 1142 VecGetArray_Fast(yy,y); 1143 VecGetArray_Fast(zz,z); 1144 1145 idx = a->j; 1146 v = a->a; 1147 ii = a->i; 1148 1149 for ( i=0; i<mbs; i++ ) { 1150 n = ii[1] - ii[0]; ii++; 1151 sum = y[i]; 1152 while (n--) sum += *v++ * x[*idx++]; 1153 z[i] = sum; 1154 } 1155 VecRestoreArray_Fast(xx,x); 1156 VecRestoreArray_Fast(yy,y); 1157 VecRestoreArray_Fast(zz,z); 1158 PLogFlops(2*a->nz); 1159 return 0; 1160 } 1161 1162 #undef __FUNC__ 1163 #define __FUNC__ "MatMultAdd_SeqBAIJ_2" 1164 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1165 { 1166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1167 register Scalar *x,*y,*z,*v,*xb,sum1,sum2; 1168 register Scalar x1,x2; 1169 int mbs=a->mbs,i,*idx,*ii; 1170 int j,n; 1171 1172 VecGetArray_Fast(xx,x); 1173 VecGetArray_Fast(yy,y); 1174 VecGetArray_Fast(zz,z); 1175 1176 idx = a->j; 1177 v = a->a; 1178 ii = a->i; 1179 1180 for ( i=0; i<mbs; i++ ) { 1181 n = ii[1] - ii[0]; ii++; 1182 sum1 = y[0]; sum2 = y[1]; 1183 for ( j=0; j<n; j++ ) { 1184 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1185 sum1 += v[0]*x1 + v[2]*x2; 1186 sum2 += v[1]*x1 + v[3]*x2; 1187 v += 4; 1188 } 1189 z[0] = sum1; z[1] = sum2; 1190 z += 2; y += 2; 1191 } 1192 VecRestoreArray_Fast(xx,x); 1193 VecRestoreArray_Fast(yy,y); 1194 VecRestoreArray_Fast(zz,z); 1195 PLogFlops(4*a->nz); 1196 return 0; 1197 } 1198 1199 #undef __FUNC__ 1200 #define __FUNC__ "MatMultAdd_SeqBAIJ_3" 1201 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1202 { 1203 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1204 register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3; 1205 int mbs=a->mbs,i,*idx,*ii,j,n; 1206 1207 VecGetArray_Fast(xx,x); 1208 VecGetArray_Fast(yy,y); 1209 VecGetArray_Fast(zz,z); 1210 1211 idx = a->j; 1212 v = a->a; 1213 ii = a->i; 1214 1215 for ( i=0; i<mbs; i++ ) { 1216 n = ii[1] - ii[0]; ii++; 1217 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1218 for ( j=0; j<n; j++ ) { 1219 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1220 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1221 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1222 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1223 v += 9; 1224 } 1225 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1226 z += 3; y += 3; 1227 } 1228 VecRestoreArray_Fast(xx,x); 1229 VecRestoreArray_Fast(yy,y); 1230 VecRestoreArray_Fast(zz,z); 1231 PLogFlops(18*a->nz); 1232 return 0; 1233 } 1234 1235 #undef __FUNC__ 1236 #define __FUNC__ "MatMultAdd_SeqBAIJ_4" 1237 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1238 { 1239 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1240 register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4; 1241 register Scalar x1,x2,x3,x4; 1242 int mbs=a->mbs,i,*idx,*ii; 1243 int j,n; 1244 1245 VecGetArray_Fast(xx,x); 1246 VecGetArray_Fast(yy,y); 1247 VecGetArray_Fast(zz,z); 1248 1249 idx = a->j; 1250 v = a->a; 1251 ii = a->i; 1252 1253 for ( i=0; i<mbs; i++ ) { 1254 n = ii[1] - ii[0]; ii++; 1255 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1256 for ( j=0; j<n; j++ ) { 1257 xb = x + 4*(*idx++); 1258 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1259 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1260 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1261 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1262 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1263 v += 16; 1264 } 1265 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1266 z += 4; y += 4; 1267 } 1268 VecRestoreArray_Fast(xx,x); 1269 VecRestoreArray_Fast(yy,y); 1270 VecRestoreArray_Fast(zz,z); 1271 PLogFlops(32*a->nz); 1272 return 0; 1273 } 1274 1275 #undef __FUNC__ 1276 #define __FUNC__ "MatMultAdd_SeqBAIJ_5" 1277 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1278 { 1279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1280 register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5; 1281 register Scalar x1,x2,x3,x4,x5; 1282 int mbs=a->mbs,i,*idx,*ii,j,n; 1283 1284 VecGetArray_Fast(xx,x); 1285 VecGetArray_Fast(yy,y); 1286 VecGetArray_Fast(zz,z); 1287 1288 idx = a->j; 1289 v = a->a; 1290 ii = a->i; 1291 1292 for ( i=0; i<mbs; i++ ) { 1293 n = ii[1] - ii[0]; ii++; 1294 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1295 for ( j=0; j<n; j++ ) { 1296 xb = x + 5*(*idx++); 1297 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1298 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1299 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1300 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1301 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1302 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1303 v += 25; 1304 } 1305 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1306 z += 5; y += 5; 1307 } 1308 VecRestoreArray_Fast(xx,x); 1309 VecRestoreArray_Fast(yy,y); 1310 VecRestoreArray_Fast(zz,z); 1311 PLogFlops(50*a->nz); 1312 return 0; 1313 } 1314 1315 #undef __FUNC__ 1316 #define __FUNC__ "MatMultAdd_SeqBAIJ_7" 1317 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1318 { 1319 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1320 register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1321 register Scalar x1,x2,x3,x4,x5,x6,x7; 1322 int mbs=a->mbs,i,*idx,*ii,j,n; 1323 1324 VecGetArray_Fast(xx,x); 1325 VecGetArray_Fast(yy,y); 1326 VecGetArray_Fast(zz,z); 1327 1328 idx = a->j; 1329 v = a->a; 1330 ii = a->i; 1331 1332 for ( i=0; i<mbs; i++ ) { 1333 n = ii[1] - ii[0]; ii++; 1334 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1335 for ( j=0; j<n; j++ ) { 1336 xb = x + 7*(*idx++); 1337 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1338 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1339 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1340 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1341 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1342 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1343 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1344 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1345 v += 49; 1346 } 1347 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1348 z += 7; y += 7; 1349 } 1350 VecRestoreArray_Fast(xx,x); 1351 VecRestoreArray_Fast(yy,y); 1352 VecRestoreArray_Fast(zz,z); 1353 PLogFlops(98*a->nz); 1354 return 0; 1355 } 1356 1357 1358 #undef __FUNC__ 1359 #define __FUNC__ "MatMultAdd_SeqBAIJ_N" 1360 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1361 { 1362 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1363 register Scalar *x,*z,*v,*xb; 1364 int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr; 1365 int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 1366 1367 if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); } 1368 1369 VecGetArray_Fast(xx,x); 1370 VecGetArray_Fast(zz,z); 1371 1372 idx = a->j; 1373 v = a->a; 1374 ii = a->i; 1375 1376 1377 if (!a->mult_work) { 1378 k = PetscMax(a->m,a->n); 1379 a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work); 1380 } 1381 work = a->mult_work; 1382 for ( i=0; i<mbs; i++ ) { 1383 n = ii[1] - ii[0]; ii++; 1384 ncols = n*bs; 1385 workt = work; 1386 for ( j=0; j<n; j++ ) { 1387 xb = x + bs*(*idx++); 1388 for ( k=0; k<bs; k++ ) workt[k] = xb[k]; 1389 workt += bs; 1390 } 1391 LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); 1392 v += n*bs2; 1393 z += bs; 1394 } 1395 VecRestoreArray_Fast(xx,x); 1396 VecRestoreArray_Fast(zz,z); 1397 PLogFlops(2*a->nz*bs2 ); 1398 return 0; 1399 } 1400 1401 #undef __FUNC__ 1402 #define __FUNC__ "MatMultTrans_SeqBAIJ" 1403 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz) 1404 { 1405 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1406 Scalar *xg,*zg,*zb; 1407 register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5; 1408 int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n; 1409 int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 1410 1411 1412 VecGetArray_Fast(xx,xg); x = xg; 1413 VecGetArray_Fast(zz,zg); z = zg; 1414 PetscMemzero(z,N*sizeof(Scalar)); 1415 1416 idx = a->j; 1417 v = a->a; 1418 ii = a->i; 1419 1420 switch (bs) { 1421 case 1: 1422 for ( i=0; i<mbs; i++ ) { 1423 n = ii[1] - ii[0]; ii++; 1424 xb = x + i; x1 = xb[0]; 1425 ib = idx + ai[i]; 1426 for ( j=0; j<n; j++ ) { 1427 rval = ib[j]; 1428 z[rval] += *v++ * x1; 1429 } 1430 } 1431 break; 1432 case 2: 1433 for ( i=0; i<mbs; i++ ) { 1434 n = ii[1] - ii[0]; ii++; 1435 xb = x + 2*i; x1 = xb[0]; x2 = xb[1]; 1436 ib = idx + ai[i]; 1437 for ( j=0; j<n; j++ ) { 1438 rval = ib[j]*2; 1439 z[rval++] += v[0]*x1 + v[1]*x2; 1440 z[rval++] += v[2]*x1 + v[3]*x2; 1441 v += 4; 1442 } 1443 } 1444 break; 1445 case 3: 1446 for ( i=0; i<mbs; i++ ) { 1447 n = ii[1] - ii[0]; ii++; 1448 xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1449 ib = idx + ai[i]; 1450 for ( j=0; j<n; j++ ) { 1451 rval = ib[j]*3; 1452 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1453 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1454 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1455 v += 9; 1456 } 1457 } 1458 break; 1459 case 4: 1460 for ( i=0; i<mbs; i++ ) { 1461 n = ii[1] - ii[0]; ii++; 1462 xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1463 ib = idx + ai[i]; 1464 for ( j=0; j<n; j++ ) { 1465 rval = ib[j]*4; 1466 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1467 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1468 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1469 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1470 v += 16; 1471 } 1472 } 1473 break; 1474 case 5: 1475 for ( i=0; i<mbs; i++ ) { 1476 n = ii[1] - ii[0]; ii++; 1477 xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1478 x4 = xb[3]; x5 = xb[4]; 1479 ib = idx + ai[i]; 1480 for ( j=0; j<n; j++ ) { 1481 rval = ib[j]*5; 1482 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1483 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1484 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1485 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1486 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1487 v += 25; 1488 } 1489 } 1490 break; 1491 /* block sizes larger then 5 by 5 are handled by BLAS */ 1492 default: { 1493 int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 1494 if (!a->mult_work) { 1495 k = PetscMax(a->m,a->n); 1496 a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar)); 1497 CHKPTRQ(a->mult_work); 1498 } 1499 work = a->mult_work; 1500 for ( i=0; i<mbs; i++ ) { 1501 n = ii[1] - ii[0]; ii++; 1502 ncols = n*bs; 1503 PetscMemzero(work,ncols*sizeof(Scalar)); 1504 LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); 1505 v += n*bs2; 1506 x += bs; 1507 workt = work; 1508 for ( j=0; j<n; j++ ) { 1509 zb = z + bs*(*idx++); 1510 for ( k=0; k<bs; k++ ) zb[k] += workt[k] ; 1511 workt += bs; 1512 } 1513 } 1514 } 1515 } 1516 ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr); 1517 ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr); 1518 PLogFlops(2*a->nz*a->bs2 - a->n); 1519 return 0; 1520 } 1521 1522 #undef __FUNC__ 1523 #define __FUNC__ "MatMultTransAdd_SeqBAIJ" 1524 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1525 { 1526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1527 Scalar *xg,*zg,*zb; 1528 register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5; 1529 int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n; 1530 int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 1531 1532 1533 1534 VecGetArray_Fast(xx,xg); x = xg; 1535 VecGetArray_Fast(zz,zg); z = zg; 1536 1537 if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); } 1538 else PetscMemzero(z,N*sizeof(Scalar)); 1539 1540 1541 idx = a->j; 1542 v = a->a; 1543 ii = a->i; 1544 1545 switch (bs) { 1546 case 1: 1547 for ( i=0; i<mbs; i++ ) { 1548 n = ii[1] - ii[0]; ii++; 1549 xb = x + i; x1 = xb[0]; 1550 ib = idx + ai[i]; 1551 for ( j=0; j<n; j++ ) { 1552 rval = ib[j]; 1553 z[rval] += *v++ * x1; 1554 } 1555 } 1556 break; 1557 case 2: 1558 for ( i=0; i<mbs; i++ ) { 1559 n = ii[1] - ii[0]; ii++; 1560 xb = x + 2*i; x1 = xb[0]; x2 = xb[1]; 1561 ib = idx + ai[i]; 1562 for ( j=0; j<n; j++ ) { 1563 rval = ib[j]*2; 1564 z[rval++] += v[0]*x1 + v[1]*x2; 1565 z[rval++] += v[2]*x1 + v[3]*x2; 1566 v += 4; 1567 } 1568 } 1569 break; 1570 case 3: 1571 for ( i=0; i<mbs; i++ ) { 1572 n = ii[1] - ii[0]; ii++; 1573 xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1574 ib = idx + ai[i]; 1575 for ( j=0; j<n; j++ ) { 1576 rval = ib[j]*3; 1577 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1578 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1579 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1580 v += 9; 1581 } 1582 } 1583 break; 1584 case 4: 1585 for ( i=0; i<mbs; i++ ) { 1586 n = ii[1] - ii[0]; ii++; 1587 xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1588 ib = idx + ai[i]; 1589 for ( j=0; j<n; j++ ) { 1590 rval = ib[j]*4; 1591 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1592 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1593 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1594 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1595 v += 16; 1596 } 1597 } 1598 break; 1599 case 5: 1600 for ( i=0; i<mbs; i++ ) { 1601 n = ii[1] - ii[0]; ii++; 1602 xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1603 x4 = xb[3]; x5 = xb[4]; 1604 ib = idx + ai[i]; 1605 for ( j=0; j<n; j++ ) { 1606 rval = ib[j]*5; 1607 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1608 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1609 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1610 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1611 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1612 v += 25; 1613 } 1614 } 1615 break; 1616 /* block sizes larger then 5 by 5 are handled by BLAS */ 1617 default: { 1618 int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 1619 if (!a->mult_work) { 1620 k = PetscMax(a->m,a->n); 1621 a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar)); 1622 CHKPTRQ(a->mult_work); 1623 } 1624 work = a->mult_work; 1625 for ( i=0; i<mbs; i++ ) { 1626 n = ii[1] - ii[0]; ii++; 1627 ncols = n*bs; 1628 PetscMemzero(work,ncols*sizeof(Scalar)); 1629 LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); 1630 v += n*bs2; 1631 x += bs; 1632 workt = work; 1633 for ( j=0; j<n; j++ ) { 1634 zb = z + bs*(*idx++); 1635 for ( k=0; k<bs; k++ ) zb[k] += workt[k] ; 1636 workt += bs; 1637 } 1638 } 1639 } 1640 } 1641 ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr); 1642 ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr); 1643 PLogFlops(2*a->nz*a->bs2); 1644 return 0; 1645 } 1646 1647 #undef __FUNC__ 1648 #define __FUNC__ "MatGetInfo_SeqBAIJ" 1649 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1650 { 1651 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1652 1653 info->rows_global = (double)a->m; 1654 info->columns_global = (double)a->n; 1655 info->rows_local = (double)a->m; 1656 info->columns_local = (double)a->n; 1657 info->block_size = a->bs2; 1658 info->nz_allocated = a->maxnz; 1659 info->nz_used = a->bs2*a->nz; 1660 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1661 /* if (info->nz_unneeded != A->info.nz_unneeded) 1662 printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */ 1663 info->assemblies = A->num_ass; 1664 info->mallocs = a->reallocs; 1665 info->memory = A->mem; 1666 if (A->factor) { 1667 info->fill_ratio_given = A->info.fill_ratio_given; 1668 info->fill_ratio_needed = A->info.fill_ratio_needed; 1669 info->factor_mallocs = A->info.factor_mallocs; 1670 } else { 1671 info->fill_ratio_given = 0; 1672 info->fill_ratio_needed = 0; 1673 info->factor_mallocs = 0; 1674 } 1675 return 0; 1676 } 1677 1678 #undef __FUNC__ 1679 #define __FUNC__ "MatEqual_SeqBAIJ" 1680 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg) 1681 { 1682 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 1683 1684 if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type"); 1685 1686 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1687 if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| 1688 (a->nz != b->nz)) { 1689 *flg = PETSC_FALSE; return 0; 1690 } 1691 1692 /* if the a->i are the same */ 1693 if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) { 1694 *flg = PETSC_FALSE; return 0; 1695 } 1696 1697 /* if a->j are the same */ 1698 if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) { 1699 *flg = PETSC_FALSE; return 0; 1700 } 1701 1702 /* if a->a are the same */ 1703 if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) { 1704 *flg = PETSC_FALSE; return 0; 1705 } 1706 *flg = PETSC_TRUE; 1707 return 0; 1708 1709 } 1710 1711 #undef __FUNC__ 1712 #define __FUNC__ "MatGetDiagonal_SeqBAIJ" 1713 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1714 { 1715 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1716 int i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1717 Scalar *x, zero = 0.0,*aa,*aa_j; 1718 1719 if (A->factor) SETERRQ(1,0,"Not for factored matrix"); 1720 bs = a->bs; 1721 aa = a->a; 1722 ai = a->i; 1723 aj = a->j; 1724 ambs = a->mbs; 1725 bs2 = a->bs2; 1726 1727 VecSet(&zero,v); 1728 VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n); 1729 if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector"); 1730 for ( i=0; i<ambs; i++ ) { 1731 for ( j=ai[i]; j<ai[i+1]; j++ ) { 1732 if (aj[j] == i) { 1733 row = i*bs; 1734 aa_j = aa+j*bs2; 1735 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1736 break; 1737 } 1738 } 1739 } 1740 return 0; 1741 } 1742 1743 #undef __FUNC__ 1744 #define __FUNC__ "MatDiagonalScale_SeqBAIJ" 1745 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 1746 { 1747 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1748 Scalar *l,*r,x,*v,*aa,*li,*ri; 1749 int i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 1750 1751 ai = a->i; 1752 aj = a->j; 1753 aa = a->a; 1754 m = a->m; 1755 n = a->n; 1756 bs = a->bs; 1757 mbs = a->mbs; 1758 bs2 = a->bs2; 1759 if (ll) { 1760 VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm); 1761 if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length"); 1762 for ( i=0; i<mbs; i++ ) { /* for each block row */ 1763 M = ai[i+1] - ai[i]; 1764 li = l + i*bs; 1765 v = aa + bs2*ai[i]; 1766 for ( j=0; j<M; j++ ) { /* for each block */ 1767 for ( k=0; k<bs2; k++ ) { 1768 (*v++) *= li[k%bs]; 1769 } 1770 } 1771 } 1772 } 1773 1774 if (rr) { 1775 VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn); 1776 if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length"); 1777 for ( i=0; i<mbs; i++ ) { /* for each block row */ 1778 M = ai[i+1] - ai[i]; 1779 v = aa + bs2*ai[i]; 1780 for ( j=0; j<M; j++ ) { /* for each block */ 1781 ri = r + bs*aj[ai[i]+j]; 1782 for ( k=0; k<bs; k++ ) { 1783 x = ri[k]; 1784 for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x; 1785 } 1786 } 1787 } 1788 } 1789 return 0; 1790 } 1791 1792 1793 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*); 1794 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double); 1795 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int); 1796 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*); 1797 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**); 1798 1799 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec); 1800 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec); 1801 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec); 1802 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec); 1803 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec); 1804 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec); 1805 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec); 1806 1807 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*); 1808 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*); 1809 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*); 1810 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*); 1811 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*); 1812 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*); 1813 1814 #undef __FUNC__ 1815 #define __FUNC__ "MatNorm_SeqBAIJ" 1816 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm) 1817 { 1818 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1819 Scalar *v = a->a; 1820 double sum = 0.0; 1821 int i,nz=a->nz,bs2=a->bs2; 1822 1823 if (type == NORM_FROBENIUS) { 1824 for (i=0; i< bs2*nz; i++ ) { 1825 #if defined(PETSC_COMPLEX) 1826 sum += real(conj(*v)*(*v)); v++; 1827 #else 1828 sum += (*v)*(*v); v++; 1829 #endif 1830 } 1831 *norm = sqrt(sum); 1832 } 1833 else { 1834 SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet"); 1835 } 1836 return 0; 1837 } 1838 1839 extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec); 1840 /* 1841 note: This can only work for identity for row and col. It would 1842 be good to check this and otherwise generate an error. 1843 */ 1844 #undef __FUNC__ 1845 #define __FUNC__ "MatILUFactor_SeqBAIJ" 1846 int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill) 1847 { 1848 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data; 1849 Mat outA; 1850 int ierr; 1851 1852 if (fill != 0) SETERRQ(1,0,"Only fill=0 supported"); 1853 1854 outA = inA; 1855 inA->factor = FACTOR_LU; 1856 a->row = row; 1857 a->col = col; 1858 1859 if (!a->solve_work) { 1860 a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work); 1861 PLogObjectMemory(inA,(a->m+a->bs)*sizeof(Scalar)); 1862 } 1863 1864 if (!a->diag) { 1865 ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr); 1866 } 1867 ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr); 1868 1869 /* 1870 Blocksize 4 has a special faster solver for ILU(0) factorization 1871 with natural ordering 1872 */ 1873 if (a->bs == 4) { 1874 inA->ops.solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 1875 } 1876 1877 return 0; 1878 } 1879 1880 #undef __FUNC__ 1881 #define __FUNC__ "MatScale_SeqBAIJ" 1882 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA) 1883 { 1884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data; 1885 int one = 1, totalnz = a->bs2*a->nz; 1886 BLscal_( &totalnz, alpha, a->a, &one ); 1887 PLogFlops(totalnz); 1888 return 0; 1889 } 1890 1891 #undef __FUNC__ 1892 #define __FUNC__ "MatGetValues_SeqBAIJ" 1893 int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v) 1894 { 1895 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1896 int *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j; 1897 int *ai = a->i, *ailen = a->ilen; 1898 int brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2; 1899 Scalar *ap, *aa = a->a, zero = 0.0; 1900 1901 for ( k=0; k<m; k++ ) { /* loop over rows */ 1902 row = im[k]; brow = row/bs; 1903 if (row < 0) SETERRQ(1,0,"Negative row"); 1904 if (row >= a->m) SETERRQ(1,0,"Row too large"); 1905 rp = aj + ai[brow] ; ap = aa + bs2*ai[brow] ; 1906 nrow = ailen[brow]; 1907 for ( l=0; l<n; l++ ) { /* loop over columns */ 1908 if (in[l] < 0) SETERRQ(1,0,"Negative column"); 1909 if (in[l] >= a->n) SETERRQ(1,0,"Column too large"); 1910 col = in[l] ; 1911 bcol = col/bs; 1912 cidx = col%bs; 1913 ridx = row%bs; 1914 high = nrow; 1915 low = 0; /* assume unsorted */ 1916 while (high-low > 5) { 1917 t = (low+high)/2; 1918 if (rp[t] > bcol) high = t; 1919 else low = t; 1920 } 1921 for ( i=low; i<high; i++ ) { 1922 if (rp[i] > bcol) break; 1923 if (rp[i] == bcol) { 1924 *v++ = ap[bs2*i+bs*cidx+ridx]; 1925 goto finished; 1926 } 1927 } 1928 *v++ = zero; 1929 finished:; 1930 } 1931 } 1932 return 0; 1933 } 1934 1935 #undef __FUNC__ 1936 #define __FUNC__ "MatGetBlockSize_SeqBAIJ" 1937 int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs) 1938 { 1939 Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data; 1940 *bs = baij->bs; 1941 return 0; 1942 } 1943 1944 /* idx should be of length atlease bs */ 1945 #undef __FUNC__ 1946 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block" 1947 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg) 1948 { 1949 int i,row; 1950 row = idx[0]; 1951 if (row%bs!=0) { *flg = PETSC_FALSE; return 0; } 1952 1953 for ( i=1; i<bs; i++ ) { 1954 if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; } 1955 } 1956 *flg = PETSC_TRUE; 1957 return 0; 1958 } 1959 1960 #undef __FUNC__ 1961 #define __FUNC__ "MatZeroRows_SeqBAIJ" 1962 int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag) 1963 { 1964 Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data; 1965 IS is_local; 1966 int ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2; 1967 PetscTruth flg; 1968 Scalar zero = 0.0,*aa; 1969 1970 /* Make a copy of the IS and sort it */ 1971 ierr = ISGetSize(is,&is_n);CHKERRQ(ierr); 1972 ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr); 1973 ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr); 1974 ierr = ISSort(is_local); CHKERRQ(ierr); 1975 ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr); 1976 1977 i = 0; 1978 while (i < is_n) { 1979 if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range"); 1980 flg = PETSC_FALSE; 1981 if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); } 1982 if (flg) { /* There exists a block of rows to be Zerowed */ 1983 baij->ilen[rows[i]/bs] = 0; 1984 i += bs; 1985 } else { /* Zero out only the requested row */ 1986 count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs; 1987 aa = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs); 1988 for ( j=0; j<count; j++ ) { 1989 aa[0] = zero; 1990 aa+=bs; 1991 } 1992 i++; 1993 } 1994 } 1995 if (diag) { 1996 for ( j=0; j<is_n; j++ ) { 1997 ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr); 1998 } 1999 } 2000 ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr); 2001 ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr); 2002 ierr = ISDestroy(is_local); CHKERRQ(ierr); 2003 ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 2004 2005 return 0; 2006 } 2007 2008 #undef __FUNC__ 2009 #define __FUNC__ "MatPrintHelp_SeqBAIJ" 2010 int MatPrintHelp_SeqBAIJ(Mat A) 2011 { 2012 static int called = 0; 2013 MPI_Comm comm = A->comm; 2014 2015 if (called) return 0; else called = 1; 2016 PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n"); 2017 PetscPrintf(comm," -mat_block_size <block_size>\n"); 2018 return 0; 2019 } 2020 2021 /* -------------------------------------------------------------------*/ 2022 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ, 2023 MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ, 2024 MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N, 2025 MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ, 2026 MatSolve_SeqBAIJ_N,0, 2027 0,0, 2028 MatLUFactor_SeqBAIJ,0, 2029 0, 2030 MatTranspose_SeqBAIJ, 2031 MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ, 2032 MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ, 2033 0,MatAssemblyEnd_SeqBAIJ, 2034 0, 2035 MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ, 2036 MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0, 2037 MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ, 2038 MatILUFactorSymbolic_SeqBAIJ,0, 2039 0,0, 2040 MatConvertSameType_SeqBAIJ,0,0, 2041 MatILUFactor_SeqBAIJ,0,0, 2042 MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ, 2043 MatGetValues_SeqBAIJ,0, 2044 MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ, 2045 0,0,0,MatGetBlockSize_SeqBAIJ, 2046 MatGetRowIJ_SeqBAIJ, 2047 MatRestoreRowIJ_SeqBAIJ, 2048 0,0,0,0,0,0, 2049 MatSetValuesBlocked_SeqBAIJ}; 2050 2051 #undef __FUNC__ 2052 #define __FUNC__ "MatCreateSeqBAIJ" 2053 /*@C 2054 MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block 2055 compressed row) format. For good matrix assembly performance the 2056 user should preallocate the matrix storage by setting the parameter nz 2057 (or the array nzz). By setting these parameters accurately, performance 2058 during matrix assembly can be increased by more than a factor of 50. 2059 2060 Input Parameters: 2061 . comm - MPI communicator, set to PETSC_COMM_SELF 2062 . bs - size of block 2063 . m - number of rows 2064 . n - number of columns 2065 . nz - number of block nonzeros per block row (same for all rows) 2066 . nzz - array containing the number of block nonzeros in the various block rows 2067 (possibly different for each block row) or PETSC_NULL 2068 2069 Output Parameter: 2070 . A - the matrix 2071 2072 Options Database Keys: 2073 $ -mat_no_unroll - uses code that does not unroll the loops in the 2074 $ block calculations (much slower) 2075 $ -mat_block_size - size of the blocks to use 2076 2077 Notes: 2078 The block AIJ format is fully compatible with standard Fortran 77 2079 storage. That is, the stored row and column indices can begin at 2080 either one (as in Fortran) or zero. See the users' manual for details. 2081 2082 Specify the preallocated storage with either nz or nnz (not both). 2083 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 2084 allocation. For additional details, see the users manual chapter on 2085 matrices. 2086 2087 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues() 2088 @*/ 2089 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A) 2090 { 2091 Mat B; 2092 Mat_SeqBAIJ *b; 2093 int i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size; 2094 2095 MPI_Comm_size(comm,&size); 2096 if (size > 1) SETERRQ(1,0,"Comm must be of size 1"); 2097 2098 ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr); 2099 2100 if (mbs*bs!=m || nbs*bs!=n) 2101 SETERRQ(1,0,"Number rows, cols must be divisible by blocksize"); 2102 2103 *A = 0; 2104 PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATSEQBAIJ,comm,MatDestroy,MatView); 2105 PLogObjectCreate(B); 2106 B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b); 2107 PetscMemzero(b,sizeof(Mat_SeqBAIJ)); 2108 PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps)); 2109 ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr); 2110 if (!flg) { 2111 switch (bs) { 2112 case 1: 2113 B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 2114 B->ops.solve = MatSolve_SeqBAIJ_1; 2115 B->ops.mult = MatMult_SeqBAIJ_1; 2116 B->ops.multadd = MatMultAdd_SeqBAIJ_1; 2117 break; 2118 case 2: 2119 B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2; 2120 B->ops.solve = MatSolve_SeqBAIJ_2; 2121 B->ops.mult = MatMult_SeqBAIJ_2; 2122 B->ops.multadd = MatMultAdd_SeqBAIJ_2; 2123 break; 2124 case 3: 2125 B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3; 2126 B->ops.solve = MatSolve_SeqBAIJ_3; 2127 B->ops.mult = MatMult_SeqBAIJ_3; 2128 B->ops.multadd = MatMultAdd_SeqBAIJ_3; 2129 break; 2130 case 4: 2131 B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4; 2132 B->ops.solve = MatSolve_SeqBAIJ_4; 2133 B->ops.mult = MatMult_SeqBAIJ_4; 2134 B->ops.multadd = MatMultAdd_SeqBAIJ_4; 2135 break; 2136 case 5: 2137 B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5; 2138 B->ops.solve = MatSolve_SeqBAIJ_5; 2139 B->ops.mult = MatMult_SeqBAIJ_5; 2140 B->ops.multadd = MatMultAdd_SeqBAIJ_5; 2141 break; 2142 case 7: 2143 B->ops.mult = MatMult_SeqBAIJ_7; 2144 B->ops.solve = MatSolve_SeqBAIJ_7; 2145 B->ops.multadd = MatMultAdd_SeqBAIJ_7; 2146 break; 2147 } 2148 } 2149 B->destroy = MatDestroy_SeqBAIJ; 2150 B->view = MatView_SeqBAIJ; 2151 B->factor = 0; 2152 B->lupivotthreshold = 1.0; 2153 B->mapping = 0; 2154 b->row = 0; 2155 b->col = 0; 2156 b->reallocs = 0; 2157 2158 b->m = m; B->m = m; B->M = m; 2159 b->n = n; B->n = n; B->N = n; 2160 b->mbs = mbs; 2161 b->nbs = nbs; 2162 b->imax = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax); 2163 if (nnz == PETSC_NULL) { 2164 if (nz == PETSC_DEFAULT) nz = 5; 2165 else if (nz <= 0) nz = 1; 2166 for ( i=0; i<mbs; i++ ) b->imax[i] = nz; 2167 nz = nz*mbs; 2168 } 2169 else { 2170 nz = 0; 2171 for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];} 2172 } 2173 2174 /* allocate the matrix space */ 2175 len = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int); 2176 b->a = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a); 2177 PetscMemzero(b->a,nz*bs2*sizeof(Scalar)); 2178 b->j = (int *) (b->a + nz*bs2); 2179 PetscMemzero(b->j,nz*sizeof(int)); 2180 b->i = b->j + nz; 2181 b->singlemalloc = 1; 2182 2183 b->i[0] = 0; 2184 for (i=1; i<mbs+1; i++) { 2185 b->i[i] = b->i[i-1] + b->imax[i-1]; 2186 } 2187 2188 /* b->ilen will count nonzeros in each block row so far. */ 2189 b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int)); 2190 PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ)); 2191 for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;} 2192 2193 b->bs = bs; 2194 b->bs2 = bs2; 2195 b->mbs = mbs; 2196 b->nz = 0; 2197 b->maxnz = nz*bs2; 2198 b->sorted = 0; 2199 b->roworiented = 1; 2200 b->nonew = 0; 2201 b->diag = 0; 2202 b->solve_work = 0; 2203 b->mult_work = 0; 2204 b->spptr = 0; 2205 B->info.nz_unneeded = (double)b->maxnz; 2206 2207 *A = B; 2208 ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr); 2209 if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); } 2210 return 0; 2211 } 2212 2213 #undef __FUNC__ 2214 #define __FUNC__ "MatConvertSameType_SeqBAIJ" 2215 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues) 2216 { 2217 Mat C; 2218 Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data; 2219 int i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2; 2220 2221 if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix"); 2222 2223 *B = 0; 2224 PetscHeaderCreate(C,_p_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm,MatDestroy,MatView); 2225 PLogObjectCreate(C); 2226 C->data = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c); 2227 PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps)); 2228 C->destroy = MatDestroy_SeqBAIJ; 2229 C->view = MatView_SeqBAIJ; 2230 C->factor = A->factor; 2231 c->row = 0; 2232 c->col = 0; 2233 C->assembled = PETSC_TRUE; 2234 2235 c->m = C->m = a->m; 2236 c->n = C->n = a->n; 2237 C->M = a->m; 2238 C->N = a->n; 2239 2240 c->bs = a->bs; 2241 c->bs2 = a->bs2; 2242 c->mbs = a->mbs; 2243 c->nbs = a->nbs; 2244 2245 c->imax = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax); 2246 c->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen); 2247 for ( i=0; i<mbs; i++ ) { 2248 c->imax[i] = a->imax[i]; 2249 c->ilen[i] = a->ilen[i]; 2250 } 2251 2252 /* allocate the matrix space */ 2253 c->singlemalloc = 1; 2254 len = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int)); 2255 c->a = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a); 2256 c->j = (int *) (c->a + nz*bs2); 2257 c->i = c->j + nz; 2258 PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int)); 2259 if (mbs > 0) { 2260 PetscMemcpy(c->j,a->j,nz*sizeof(int)); 2261 if (cpvalues == COPY_VALUES) { 2262 PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar)); 2263 } 2264 } 2265 2266 PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ)); 2267 c->sorted = a->sorted; 2268 c->roworiented = a->roworiented; 2269 c->nonew = a->nonew; 2270 2271 if (a->diag) { 2272 c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag); 2273 PLogObjectMemory(C,(mbs+1)*sizeof(int)); 2274 for ( i=0; i<mbs; i++ ) { 2275 c->diag[i] = a->diag[i]; 2276 } 2277 } 2278 else c->diag = 0; 2279 c->nz = a->nz; 2280 c->maxnz = a->maxnz; 2281 c->solve_work = 0; 2282 c->spptr = 0; /* Dangerous -I'm throwing away a->spptr */ 2283 c->mult_work = 0; 2284 *B = C; 2285 return 0; 2286 } 2287 2288 #undef __FUNC__ 2289 #define __FUNC__ "MatLoad_SeqBAIJ" 2290 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A) 2291 { 2292 Mat_SeqBAIJ *a; 2293 Mat B; 2294 int i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg; 2295 int *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount; 2296 int kmax,jcount,block,idx,point,nzcountb,extra_rows; 2297 int *masked, nmask,tmp,bs2,ishift; 2298 Scalar *aa; 2299 MPI_Comm comm = ((PetscObject) viewer)->comm; 2300 2301 ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr); 2302 bs2 = bs*bs; 2303 2304 MPI_Comm_size(comm,&size); 2305 if (size > 1) SETERRQ(1,0,"view must have one processor"); 2306 ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr); 2307 ierr = PetscBinaryRead(fd,header,4,PETSC_INT); CHKERRQ(ierr); 2308 if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object"); 2309 M = header[1]; N = header[2]; nz = header[3]; 2310 2311 if (M != N) SETERRQ(1,0,"Can only do square matrices"); 2312 2313 /* 2314 This code adds extra rows to make sure the number of rows is 2315 divisible by the blocksize 2316 */ 2317 mbs = M/bs; 2318 extra_rows = bs - M + bs*(mbs); 2319 if (extra_rows == bs) extra_rows = 0; 2320 else mbs++; 2321 if (extra_rows) { 2322 PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n"); 2323 } 2324 2325 /* read in row lengths */ 2326 rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths); 2327 ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr); 2328 for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1; 2329 2330 /* read in column indices */ 2331 jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj); 2332 ierr = PetscBinaryRead(fd,jj,nz,PETSC_INT); CHKERRQ(ierr); 2333 for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i; 2334 2335 /* loop over row lengths determining block row lengths */ 2336 browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths); 2337 PetscMemzero(browlengths,mbs*sizeof(int)); 2338 mask = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask); 2339 PetscMemzero(mask,mbs*sizeof(int)); 2340 masked = mask + mbs; 2341 rowcount = 0; nzcount = 0; 2342 for ( i=0; i<mbs; i++ ) { 2343 nmask = 0; 2344 for ( j=0; j<bs; j++ ) { 2345 kmax = rowlengths[rowcount]; 2346 for ( k=0; k<kmax; k++ ) { 2347 tmp = jj[nzcount++]/bs; 2348 if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;} 2349 } 2350 rowcount++; 2351 } 2352 browlengths[i] += nmask; 2353 /* zero out the mask elements we set */ 2354 for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0; 2355 } 2356 2357 /* create our matrix */ 2358 ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);CHKERRQ(ierr); 2359 B = *A; 2360 a = (Mat_SeqBAIJ *) B->data; 2361 2362 /* set matrix "i" values */ 2363 a->i[0] = 0; 2364 for ( i=1; i<= mbs; i++ ) { 2365 a->i[i] = a->i[i-1] + browlengths[i-1]; 2366 a->ilen[i-1] = browlengths[i-1]; 2367 } 2368 a->nz = 0; 2369 for ( i=0; i<mbs; i++ ) a->nz += browlengths[i]; 2370 2371 /* read in nonzero values */ 2372 aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa); 2373 ierr = PetscBinaryRead(fd,aa,nz,PETSC_SCALAR); CHKERRQ(ierr); 2374 for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0; 2375 2376 /* set "a" and "j" values into matrix */ 2377 nzcount = 0; jcount = 0; 2378 for ( i=0; i<mbs; i++ ) { 2379 nzcountb = nzcount; 2380 nmask = 0; 2381 for ( j=0; j<bs; j++ ) { 2382 kmax = rowlengths[i*bs+j]; 2383 for ( k=0; k<kmax; k++ ) { 2384 tmp = jj[nzcount++]/bs; 2385 if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;} 2386 } 2387 rowcount++; 2388 } 2389 /* sort the masked values */ 2390 PetscSortInt(nmask,masked); 2391 2392 /* set "j" values into matrix */ 2393 maskcount = 1; 2394 for ( j=0; j<nmask; j++ ) { 2395 a->j[jcount++] = masked[j]; 2396 mask[masked[j]] = maskcount++; 2397 } 2398 /* set "a" values into matrix */ 2399 ishift = bs2*a->i[i]; 2400 for ( j=0; j<bs; j++ ) { 2401 kmax = rowlengths[i*bs+j]; 2402 for ( k=0; k<kmax; k++ ) { 2403 tmp = jj[nzcountb]/bs ; 2404 block = mask[tmp] - 1; 2405 point = jj[nzcountb] - bs*tmp; 2406 idx = ishift + bs2*block + j + bs*point; 2407 a->a[idx] = aa[nzcountb++]; 2408 } 2409 } 2410 /* zero out the mask elements we set */ 2411 for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0; 2412 } 2413 if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix"); 2414 2415 PetscFree(rowlengths); 2416 PetscFree(browlengths); 2417 PetscFree(aa); 2418 PetscFree(jj); 2419 PetscFree(mask); 2420 2421 B->assembled = PETSC_TRUE; 2422 2423 ierr = MatView_Private(B); CHKERRQ(ierr); 2424 return 0; 2425 } 2426 2427 2428 2429