1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n; ++j) { 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 64 } 65 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 66 } 67 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 68 ierr = PetscFree(nidx);CHKERRQ(ierr); 69 ierr = PetscFree(nidx2);CHKERRQ(ierr); 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag; 87 88 PetscFunctionBegin; 89 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 145 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 146 for (i=0; i<nrows; i++) { 147 PetscInt ilen; 148 mat_i = c->i[i]; 149 mat_j = c->j + mat_i; 150 mat_a = c->a + mat_i*bs2; 151 ilen = c->ilen[i]; 152 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 153 } 154 ierr = PetscFree(work);CHKERRQ(ierr); 155 } 156 157 /* Free work space */ 158 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 159 ierr = PetscFree(smap);CHKERRQ(ierr); 160 ierr = PetscFree(lens);CHKERRQ(ierr); 161 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 163 164 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 165 *B = C; 166 PetscFunctionReturn(0); 167 } 168 169 #undef __FUNCT__ 170 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 171 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 172 { 173 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 174 IS is1,is2; 175 PetscErrorCode ierr; 176 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 177 const PetscInt *irow,*icol; 178 179 PetscFunctionBegin; 180 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 181 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 182 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 183 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 184 185 /* Verify if the indices corespond to each element in a block 186 and form the IS with compressed IS */ 187 maxmnbs = PetscMax(a->mbs,a->nbs); 188 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 189 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 190 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 191 for (i=0; i<a->mbs; i++) { 192 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 193 } 194 count = 0; 195 for (i=0; i<nrows; i++) { 196 PetscInt j = irow[i] / bs; 197 if ((vary[j]--)==bs) iary[count++] = j; 198 } 199 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 200 201 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 202 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 203 for (i=0; i<a->nbs; i++) { 204 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 205 } 206 count = 0; 207 for (i=0; i<ncols; i++) { 208 PetscInt j = icol[i] / bs; 209 if ((vary[j]--)==bs) iary[count++] = j; 210 } 211 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 212 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 213 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 214 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 215 216 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 217 ierr = ISDestroy(&is1);CHKERRQ(ierr); 218 ierr = ISDestroy(&is2);CHKERRQ(ierr); 219 PetscFunctionReturn(0); 220 } 221 222 #undef __FUNCT__ 223 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 224 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 225 { 226 PetscErrorCode ierr; 227 PetscInt i; 228 229 PetscFunctionBegin; 230 if (scall == MAT_INITIAL_MATRIX) { 231 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 232 } 233 234 for (i=0; i<n; i++) { 235 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 236 } 237 PetscFunctionReturn(0); 238 } 239 240 241 /* -------------------------------------------------------*/ 242 /* Should check that shapes of vectors and matrices match */ 243 /* -------------------------------------------------------*/ 244 245 #undef __FUNCT__ 246 #define __FUNCT__ "MatMult_SeqBAIJ_1" 247 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 248 { 249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 250 PetscScalar *z,sum; 251 const PetscScalar *x; 252 const MatScalar *v; 253 PetscErrorCode ierr; 254 PetscInt mbs,i,n; 255 const PetscInt *idx,*ii,*ridx=NULL; 256 PetscBool usecprow=a->compressedrow.use; 257 258 PetscFunctionBegin; 259 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 260 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 261 262 if (usecprow) { 263 mbs = a->compressedrow.nrows; 264 ii = a->compressedrow.i; 265 ridx = a->compressedrow.rindex; 266 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 267 } else { 268 mbs = a->mbs; 269 ii = a->i; 270 } 271 272 for (i=0; i<mbs; i++) { 273 n = ii[1] - ii[0]; 274 v = a->a + ii[0]; 275 idx = a->j + ii[0]; 276 ii++; 277 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 278 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 279 sum = 0.0; 280 PetscSparseDensePlusDot(sum,x,v,idx,n); 281 if (usecprow) { 282 z[ridx[i]] = sum; 283 } else { 284 z[i] = sum; 285 } 286 } 287 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 288 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 289 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 290 PetscFunctionReturn(0); 291 } 292 293 #undef __FUNCT__ 294 #define __FUNCT__ "MatMult_SeqBAIJ_2" 295 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 296 { 297 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 298 PetscScalar *z = 0,sum1,sum2,*zarray; 299 const PetscScalar *x,*xb; 300 PetscScalar x1,x2; 301 const MatScalar *v; 302 PetscErrorCode ierr; 303 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 304 PetscBool usecprow=a->compressedrow.use; 305 306 PetscFunctionBegin; 307 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 308 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 309 310 idx = a->j; 311 v = a->a; 312 if (usecprow) { 313 mbs = a->compressedrow.nrows; 314 ii = a->compressedrow.i; 315 ridx = a->compressedrow.rindex; 316 } else { 317 mbs = a->mbs; 318 ii = a->i; 319 z = zarray; 320 } 321 322 for (i=0; i<mbs; i++) { 323 n = ii[1] - ii[0]; ii++; 324 sum1 = 0.0; sum2 = 0.0; 325 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 326 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 327 for (j=0; j<n; j++) { 328 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 329 sum1 += v[0]*x1 + v[2]*x2; 330 sum2 += v[1]*x1 + v[3]*x2; 331 v += 4; 332 } 333 if (usecprow) z = zarray + 2*ridx[i]; 334 z[0] = sum1; z[1] = sum2; 335 if (!usecprow) z += 2; 336 } 337 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 338 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 339 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 340 PetscFunctionReturn(0); 341 } 342 343 #undef __FUNCT__ 344 #define __FUNCT__ "MatMult_SeqBAIJ_3" 345 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 346 { 347 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 348 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 349 const PetscScalar *x,*xb; 350 const MatScalar *v; 351 PetscErrorCode ierr; 352 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 353 PetscBool usecprow=a->compressedrow.use; 354 355 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 356 #pragma disjoint(*v,*z,*xb) 357 #endif 358 359 PetscFunctionBegin; 360 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 361 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 362 363 idx = a->j; 364 v = a->a; 365 if (usecprow) { 366 mbs = a->compressedrow.nrows; 367 ii = a->compressedrow.i; 368 ridx = a->compressedrow.rindex; 369 } else { 370 mbs = a->mbs; 371 ii = a->i; 372 z = zarray; 373 } 374 375 for (i=0; i<mbs; i++) { 376 n = ii[1] - ii[0]; ii++; 377 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 378 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 379 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 380 for (j=0; j<n; j++) { 381 xb = x + 3*(*idx++); 382 x1 = xb[0]; 383 x2 = xb[1]; 384 x3 = xb[2]; 385 386 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 387 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 388 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 389 v += 9; 390 } 391 if (usecprow) z = zarray + 3*ridx[i]; 392 z[0] = sum1; z[1] = sum2; z[2] = sum3; 393 if (!usecprow) z += 3; 394 } 395 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 396 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 397 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 398 PetscFunctionReturn(0); 399 } 400 401 #undef __FUNCT__ 402 #define __FUNCT__ "MatMult_SeqBAIJ_4" 403 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 404 { 405 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 406 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 407 const PetscScalar *x,*xb; 408 const MatScalar *v; 409 PetscErrorCode ierr; 410 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 411 PetscBool usecprow=a->compressedrow.use; 412 413 PetscFunctionBegin; 414 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 415 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 416 417 idx = a->j; 418 v = a->a; 419 if (usecprow) { 420 mbs = a->compressedrow.nrows; 421 ii = a->compressedrow.i; 422 ridx = a->compressedrow.rindex; 423 } else { 424 mbs = a->mbs; 425 ii = a->i; 426 z = zarray; 427 } 428 429 for (i=0; i<mbs; i++) { 430 n = ii[1] - ii[0]; 431 ii++; 432 sum1 = 0.0; 433 sum2 = 0.0; 434 sum3 = 0.0; 435 sum4 = 0.0; 436 437 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 438 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 439 for (j=0; j<n; j++) { 440 xb = x + 4*(*idx++); 441 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 442 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 443 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 444 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 445 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 446 v += 16; 447 } 448 if (usecprow) z = zarray + 4*ridx[i]; 449 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 450 if (!usecprow) z += 4; 451 } 452 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 453 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 454 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 455 PetscFunctionReturn(0); 456 } 457 458 #undef __FUNCT__ 459 #define __FUNCT__ "MatMult_SeqBAIJ_5" 460 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 461 { 462 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 463 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 464 const PetscScalar *xb,*x; 465 const MatScalar *v; 466 PetscErrorCode ierr; 467 const PetscInt *idx,*ii,*ridx=NULL; 468 PetscInt mbs,i,j,n; 469 PetscBool usecprow=a->compressedrow.use; 470 471 PetscFunctionBegin; 472 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 473 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 474 475 idx = a->j; 476 v = a->a; 477 if (usecprow) { 478 mbs = a->compressedrow.nrows; 479 ii = a->compressedrow.i; 480 ridx = a->compressedrow.rindex; 481 } else { 482 mbs = a->mbs; 483 ii = a->i; 484 z = zarray; 485 } 486 487 for (i=0; i<mbs; i++) { 488 n = ii[1] - ii[0]; ii++; 489 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 490 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 491 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 492 for (j=0; j<n; j++) { 493 xb = x + 5*(*idx++); 494 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 495 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 496 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 497 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 498 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 499 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 500 v += 25; 501 } 502 if (usecprow) z = zarray + 5*ridx[i]; 503 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 504 if (!usecprow) z += 5; 505 } 506 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 507 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 508 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 509 PetscFunctionReturn(0); 510 } 511 512 513 514 #undef __FUNCT__ 515 #define __FUNCT__ "MatMult_SeqBAIJ_6" 516 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 517 { 518 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 519 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 520 const PetscScalar *x,*xb; 521 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 522 const MatScalar *v; 523 PetscErrorCode ierr; 524 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 525 PetscBool usecprow=a->compressedrow.use; 526 527 PetscFunctionBegin; 528 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 529 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 530 531 idx = a->j; 532 v = a->a; 533 if (usecprow) { 534 mbs = a->compressedrow.nrows; 535 ii = a->compressedrow.i; 536 ridx = a->compressedrow.rindex; 537 } else { 538 mbs = a->mbs; 539 ii = a->i; 540 z = zarray; 541 } 542 543 for (i=0; i<mbs; i++) { 544 n = ii[1] - ii[0]; 545 ii++; 546 sum1 = 0.0; 547 sum2 = 0.0; 548 sum3 = 0.0; 549 sum4 = 0.0; 550 sum5 = 0.0; 551 sum6 = 0.0; 552 553 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 554 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 555 for (j=0; j<n; j++) { 556 xb = x + 6*(*idx++); 557 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 558 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 559 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 560 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 561 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 562 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 563 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 564 v += 36; 565 } 566 if (usecprow) z = zarray + 6*ridx[i]; 567 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 568 if (!usecprow) z += 6; 569 } 570 571 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 572 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 573 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 574 PetscFunctionReturn(0); 575 } 576 577 #undef __FUNCT__ 578 #define __FUNCT__ "MatMult_SeqBAIJ_7" 579 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 580 { 581 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 582 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 583 const PetscScalar *x,*xb; 584 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 585 const MatScalar *v; 586 PetscErrorCode ierr; 587 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 588 PetscBool usecprow=a->compressedrow.use; 589 590 PetscFunctionBegin; 591 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 592 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 593 594 idx = a->j; 595 v = a->a; 596 if (usecprow) { 597 mbs = a->compressedrow.nrows; 598 ii = a->compressedrow.i; 599 ridx = a->compressedrow.rindex; 600 } else { 601 mbs = a->mbs; 602 ii = a->i; 603 z = zarray; 604 } 605 606 for (i=0; i<mbs; i++) { 607 n = ii[1] - ii[0]; 608 ii++; 609 sum1 = 0.0; 610 sum2 = 0.0; 611 sum3 = 0.0; 612 sum4 = 0.0; 613 sum5 = 0.0; 614 sum6 = 0.0; 615 sum7 = 0.0; 616 617 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 618 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 619 for (j=0; j<n; j++) { 620 xb = x + 7*(*idx++); 621 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 622 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 623 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 624 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 625 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 626 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 627 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 628 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 629 v += 49; 630 } 631 if (usecprow) z = zarray + 7*ridx[i]; 632 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 633 if (!usecprow) z += 7; 634 } 635 636 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 637 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 638 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 639 PetscFunctionReturn(0); 640 } 641 642 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 643 /* Default MatMult for block size 15 */ 644 645 #undef __FUNCT__ 646 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 647 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 648 { 649 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 650 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 651 const PetscScalar *x,*xb; 652 PetscScalar *zarray,xv; 653 const MatScalar *v; 654 PetscErrorCode ierr; 655 const PetscInt *ii,*ij=a->j,*idx; 656 PetscInt mbs,i,j,k,n,*ridx=NULL; 657 PetscBool usecprow=a->compressedrow.use; 658 659 PetscFunctionBegin; 660 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 661 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 662 663 v = a->a; 664 if (usecprow) { 665 mbs = a->compressedrow.nrows; 666 ii = a->compressedrow.i; 667 ridx = a->compressedrow.rindex; 668 } else { 669 mbs = a->mbs; 670 ii = a->i; 671 z = zarray; 672 } 673 674 for (i=0; i<mbs; i++) { 675 n = ii[i+1] - ii[i]; 676 idx = ij + ii[i]; 677 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 678 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 679 680 for (j=0; j<n; j++) { 681 xb = x + 15*(idx[j]); 682 683 for (k=0; k<15; k++) { 684 xv = xb[k]; 685 sum1 += v[0]*xv; 686 sum2 += v[1]*xv; 687 sum3 += v[2]*xv; 688 sum4 += v[3]*xv; 689 sum5 += v[4]*xv; 690 sum6 += v[5]*xv; 691 sum7 += v[6]*xv; 692 sum8 += v[7]*xv; 693 sum9 += v[8]*xv; 694 sum10 += v[9]*xv; 695 sum11 += v[10]*xv; 696 sum12 += v[11]*xv; 697 sum13 += v[12]*xv; 698 sum14 += v[13]*xv; 699 sum15 += v[14]*xv; 700 v += 15; 701 } 702 } 703 if (usecprow) z = zarray + 15*ridx[i]; 704 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 705 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 706 707 if (!usecprow) z += 15; 708 } 709 710 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 711 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 712 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 713 PetscFunctionReturn(0); 714 } 715 716 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 717 #undef __FUNCT__ 718 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 719 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 720 { 721 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 722 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 723 const PetscScalar *x,*xb; 724 PetscScalar x1,x2,x3,x4,*zarray; 725 const MatScalar *v; 726 PetscErrorCode ierr; 727 const PetscInt *ii,*ij=a->j,*idx; 728 PetscInt mbs,i,j,n,*ridx=NULL; 729 PetscBool usecprow=a->compressedrow.use; 730 731 PetscFunctionBegin; 732 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 733 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 734 735 v = a->a; 736 if (usecprow) { 737 mbs = a->compressedrow.nrows; 738 ii = a->compressedrow.i; 739 ridx = a->compressedrow.rindex; 740 } else { 741 mbs = a->mbs; 742 ii = a->i; 743 z = zarray; 744 } 745 746 for (i=0; i<mbs; i++) { 747 n = ii[i+1] - ii[i]; 748 idx = ij + ii[i]; 749 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 750 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 751 752 for (j=0; j<n; j++) { 753 xb = x + 15*(idx[j]); 754 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 755 756 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 757 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 758 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 759 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 760 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 761 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 762 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 763 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 764 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 765 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 766 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 767 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 768 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 769 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 770 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 771 772 v += 60; 773 774 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 775 776 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 777 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 778 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 779 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 780 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 781 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 782 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 783 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 784 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 785 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 786 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 787 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 788 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 789 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 790 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 791 v += 60; 792 793 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 794 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 795 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 796 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 797 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 798 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 799 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 800 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 801 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 802 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 803 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 804 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 805 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 806 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 807 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 808 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 809 v += 60; 810 811 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 812 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 813 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 814 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 815 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 816 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 817 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 818 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 819 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 820 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 821 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 822 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 823 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 824 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 825 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 826 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 827 v += 45; 828 } 829 if (usecprow) z = zarray + 15*ridx[i]; 830 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 831 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 832 833 if (!usecprow) z += 15; 834 } 835 836 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 837 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 838 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 839 PetscFunctionReturn(0); 840 } 841 842 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 843 #undef __FUNCT__ 844 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 845 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 846 { 847 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 848 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 849 const PetscScalar *x,*xb; 850 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 851 const MatScalar *v; 852 PetscErrorCode ierr; 853 const PetscInt *ii,*ij=a->j,*idx; 854 PetscInt mbs,i,j,n,*ridx=NULL; 855 PetscBool usecprow=a->compressedrow.use; 856 857 PetscFunctionBegin; 858 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 859 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 860 861 v = a->a; 862 if (usecprow) { 863 mbs = a->compressedrow.nrows; 864 ii = a->compressedrow.i; 865 ridx = a->compressedrow.rindex; 866 } else { 867 mbs = a->mbs; 868 ii = a->i; 869 z = zarray; 870 } 871 872 for (i=0; i<mbs; i++) { 873 n = ii[i+1] - ii[i]; 874 idx = ij + ii[i]; 875 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 876 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 877 878 for (j=0; j<n; j++) { 879 xb = x + 15*(idx[j]); 880 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 881 x8 = xb[7]; 882 883 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 884 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 885 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 886 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 887 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 888 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 889 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 890 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 891 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 892 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 893 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 894 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 895 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 896 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 897 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 898 v += 120; 899 900 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 901 902 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 903 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 904 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 905 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 906 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 907 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 908 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 909 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 910 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 911 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 912 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 913 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 914 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 915 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 916 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 917 v += 105; 918 } 919 if (usecprow) z = zarray + 15*ridx[i]; 920 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 921 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 922 923 if (!usecprow) z += 15; 924 } 925 926 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 927 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 928 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 929 PetscFunctionReturn(0); 930 } 931 932 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 933 934 #undef __FUNCT__ 935 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 936 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 937 { 938 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 939 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 940 const PetscScalar *x,*xb; 941 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 942 const MatScalar *v; 943 PetscErrorCode ierr; 944 const PetscInt *ii,*ij=a->j,*idx; 945 PetscInt mbs,i,j,n,*ridx=NULL; 946 PetscBool usecprow=a->compressedrow.use; 947 948 PetscFunctionBegin; 949 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 950 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 951 952 v = a->a; 953 if (usecprow) { 954 mbs = a->compressedrow.nrows; 955 ii = a->compressedrow.i; 956 ridx = a->compressedrow.rindex; 957 } else { 958 mbs = a->mbs; 959 ii = a->i; 960 z = zarray; 961 } 962 963 for (i=0; i<mbs; i++) { 964 n = ii[i+1] - ii[i]; 965 idx = ij + ii[i]; 966 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 967 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 968 969 for (j=0; j<n; j++) { 970 xb = x + 15*(idx[j]); 971 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 972 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 973 974 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 975 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 976 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 977 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 978 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 979 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 980 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 981 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 982 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 983 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 984 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 985 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 986 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 987 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 988 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 989 v += 225; 990 } 991 if (usecprow) z = zarray + 15*ridx[i]; 992 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 993 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 994 995 if (!usecprow) z += 15; 996 } 997 998 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 999 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1000 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1001 PetscFunctionReturn(0); 1002 } 1003 1004 1005 /* 1006 This will not work with MatScalar == float because it calls the BLAS 1007 */ 1008 #undef __FUNCT__ 1009 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1010 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1011 { 1012 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1013 PetscScalar *z = 0,*work,*workt,*zarray; 1014 const PetscScalar *x,*xb; 1015 const MatScalar *v; 1016 PetscErrorCode ierr; 1017 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1018 const PetscInt *idx,*ii,*ridx=NULL; 1019 PetscInt ncols,k; 1020 PetscBool usecprow=a->compressedrow.use; 1021 1022 PetscFunctionBegin; 1023 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1024 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1025 1026 idx = a->j; 1027 v = a->a; 1028 if (usecprow) { 1029 mbs = a->compressedrow.nrows; 1030 ii = a->compressedrow.i; 1031 ridx = a->compressedrow.rindex; 1032 } else { 1033 mbs = a->mbs; 1034 ii = a->i; 1035 z = zarray; 1036 } 1037 1038 if (!a->mult_work) { 1039 k = PetscMax(A->rmap->n,A->cmap->n); 1040 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1041 } 1042 work = a->mult_work; 1043 for (i=0; i<mbs; i++) { 1044 n = ii[1] - ii[0]; ii++; 1045 ncols = n*bs; 1046 workt = work; 1047 for (j=0; j<n; j++) { 1048 xb = x + bs*(*idx++); 1049 for (k=0; k<bs; k++) workt[k] = xb[k]; 1050 workt += bs; 1051 } 1052 if (usecprow) z = zarray + bs*ridx[i]; 1053 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1054 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1055 v += n*bs2; 1056 if (!usecprow) z += bs; 1057 } 1058 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1059 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1060 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1061 PetscFunctionReturn(0); 1062 } 1063 1064 #undef __FUNCT__ 1065 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1066 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1067 { 1068 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1069 const PetscScalar *x; 1070 PetscScalar *y,*z,sum; 1071 const MatScalar *v; 1072 PetscErrorCode ierr; 1073 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1074 const PetscInt *idx,*ii; 1075 PetscBool usecprow=a->compressedrow.use; 1076 1077 PetscFunctionBegin; 1078 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1079 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1080 1081 idx = a->j; 1082 v = a->a; 1083 if (usecprow) { 1084 if (zz != yy) { 1085 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1086 } 1087 mbs = a->compressedrow.nrows; 1088 ii = a->compressedrow.i; 1089 ridx = a->compressedrow.rindex; 1090 } else { 1091 ii = a->i; 1092 } 1093 1094 for (i=0; i<mbs; i++) { 1095 n = ii[1] - ii[0]; 1096 ii++; 1097 if (!usecprow) { 1098 sum = y[i]; 1099 } else { 1100 sum = y[ridx[i]]; 1101 } 1102 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1103 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1104 PetscSparseDensePlusDot(sum,x,v,idx,n); 1105 v += n; 1106 idx += n; 1107 if (usecprow) { 1108 z[ridx[i]] = sum; 1109 } else { 1110 z[i] = sum; 1111 } 1112 } 1113 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1114 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1115 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1116 PetscFunctionReturn(0); 1117 } 1118 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1121 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1122 { 1123 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1124 PetscScalar *y = 0,*z = 0,sum1,sum2; 1125 const PetscScalar *x,*xb; 1126 PetscScalar x1,x2,*yarray,*zarray; 1127 const MatScalar *v; 1128 PetscErrorCode ierr; 1129 PetscInt mbs = a->mbs,i,n,j; 1130 const PetscInt *idx,*ii,*ridx = NULL; 1131 PetscBool usecprow = a->compressedrow.use; 1132 1133 PetscFunctionBegin; 1134 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1135 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1136 1137 idx = a->j; 1138 v = a->a; 1139 if (usecprow) { 1140 if (zz != yy) { 1141 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 } 1143 mbs = a->compressedrow.nrows; 1144 ii = a->compressedrow.i; 1145 ridx = a->compressedrow.rindex; 1146 if (zz != yy) { 1147 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 } 1149 } else { 1150 ii = a->i; 1151 y = yarray; 1152 z = zarray; 1153 } 1154 1155 for (i=0; i<mbs; i++) { 1156 n = ii[1] - ii[0]; ii++; 1157 if (usecprow) { 1158 z = zarray + 2*ridx[i]; 1159 y = yarray + 2*ridx[i]; 1160 } 1161 sum1 = y[0]; sum2 = y[1]; 1162 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1163 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1164 for (j=0; j<n; j++) { 1165 xb = x + 2*(*idx++); 1166 x1 = xb[0]; 1167 x2 = xb[1]; 1168 1169 sum1 += v[0]*x1 + v[2]*x2; 1170 sum2 += v[1]*x1 + v[3]*x2; 1171 v += 4; 1172 } 1173 z[0] = sum1; z[1] = sum2; 1174 if (!usecprow) { 1175 z += 2; y += 2; 1176 } 1177 } 1178 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1179 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1180 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1181 PetscFunctionReturn(0); 1182 } 1183 1184 #undef __FUNCT__ 1185 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1186 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1187 { 1188 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1189 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1190 const PetscScalar *x,*xb; 1191 const MatScalar *v; 1192 PetscErrorCode ierr; 1193 PetscInt mbs = a->mbs,i,j,n; 1194 const PetscInt *idx,*ii,*ridx = NULL; 1195 PetscBool usecprow = a->compressedrow.use; 1196 1197 PetscFunctionBegin; 1198 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1199 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1200 1201 idx = a->j; 1202 v = a->a; 1203 if (usecprow) { 1204 if (zz != yy) { 1205 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1206 } 1207 mbs = a->compressedrow.nrows; 1208 ii = a->compressedrow.i; 1209 ridx = a->compressedrow.rindex; 1210 } else { 1211 ii = a->i; 1212 y = yarray; 1213 z = zarray; 1214 } 1215 1216 for (i=0; i<mbs; i++) { 1217 n = ii[1] - ii[0]; ii++; 1218 if (usecprow) { 1219 z = zarray + 3*ridx[i]; 1220 y = yarray + 3*ridx[i]; 1221 } 1222 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1223 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1224 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1225 for (j=0; j<n; j++) { 1226 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1227 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1228 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1229 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1230 v += 9; 1231 } 1232 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1233 if (!usecprow) { 1234 z += 3; y += 3; 1235 } 1236 } 1237 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1238 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1239 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1240 PetscFunctionReturn(0); 1241 } 1242 1243 #undef __FUNCT__ 1244 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1245 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1246 { 1247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1248 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1249 const PetscScalar *x,*xb; 1250 const MatScalar *v; 1251 PetscErrorCode ierr; 1252 PetscInt mbs = a->mbs,i,j,n; 1253 const PetscInt *idx,*ii,*ridx=NULL; 1254 PetscBool usecprow=a->compressedrow.use; 1255 1256 PetscFunctionBegin; 1257 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1258 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1259 1260 idx = a->j; 1261 v = a->a; 1262 if (usecprow) { 1263 if (zz != yy) { 1264 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1265 } 1266 mbs = a->compressedrow.nrows; 1267 ii = a->compressedrow.i; 1268 ridx = a->compressedrow.rindex; 1269 } else { 1270 ii = a->i; 1271 y = yarray; 1272 z = zarray; 1273 } 1274 1275 for (i=0; i<mbs; i++) { 1276 n = ii[1] - ii[0]; ii++; 1277 if (usecprow) { 1278 z = zarray + 4*ridx[i]; 1279 y = yarray + 4*ridx[i]; 1280 } 1281 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1282 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1283 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1284 for (j=0; j<n; j++) { 1285 xb = x + 4*(*idx++); 1286 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1287 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1288 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1289 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1290 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1291 v += 16; 1292 } 1293 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1294 if (!usecprow) { 1295 z += 4; y += 4; 1296 } 1297 } 1298 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1299 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1300 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1301 PetscFunctionReturn(0); 1302 } 1303 1304 #undef __FUNCT__ 1305 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1306 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1307 { 1308 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1309 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1310 const PetscScalar *x,*xb; 1311 PetscScalar *yarray,*zarray; 1312 const MatScalar *v; 1313 PetscErrorCode ierr; 1314 PetscInt mbs = a->mbs,i,j,n; 1315 const PetscInt *idx,*ii,*ridx = NULL; 1316 PetscBool usecprow=a->compressedrow.use; 1317 1318 PetscFunctionBegin; 1319 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1320 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1321 1322 idx = a->j; 1323 v = a->a; 1324 if (usecprow) { 1325 if (zz != yy) { 1326 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1327 } 1328 mbs = a->compressedrow.nrows; 1329 ii = a->compressedrow.i; 1330 ridx = a->compressedrow.rindex; 1331 } else { 1332 ii = a->i; 1333 y = yarray; 1334 z = zarray; 1335 } 1336 1337 for (i=0; i<mbs; i++) { 1338 n = ii[1] - ii[0]; ii++; 1339 if (usecprow) { 1340 z = zarray + 5*ridx[i]; 1341 y = yarray + 5*ridx[i]; 1342 } 1343 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1344 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1345 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1346 for (j=0; j<n; j++) { 1347 xb = x + 5*(*idx++); 1348 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1349 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1350 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1351 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1352 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1353 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1354 v += 25; 1355 } 1356 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1357 if (!usecprow) { 1358 z += 5; y += 5; 1359 } 1360 } 1361 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1362 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1363 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1364 PetscFunctionReturn(0); 1365 } 1366 #undef __FUNCT__ 1367 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1368 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1369 { 1370 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1371 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1372 const PetscScalar *x,*xb; 1373 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1374 const MatScalar *v; 1375 PetscErrorCode ierr; 1376 PetscInt mbs = a->mbs,i,j,n; 1377 const PetscInt *idx,*ii,*ridx=NULL; 1378 PetscBool usecprow=a->compressedrow.use; 1379 1380 PetscFunctionBegin; 1381 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1382 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1383 1384 idx = a->j; 1385 v = a->a; 1386 if (usecprow) { 1387 if (zz != yy) { 1388 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1389 } 1390 mbs = a->compressedrow.nrows; 1391 ii = a->compressedrow.i; 1392 ridx = a->compressedrow.rindex; 1393 } else { 1394 ii = a->i; 1395 y = yarray; 1396 z = zarray; 1397 } 1398 1399 for (i=0; i<mbs; i++) { 1400 n = ii[1] - ii[0]; ii++; 1401 if (usecprow) { 1402 z = zarray + 6*ridx[i]; 1403 y = yarray + 6*ridx[i]; 1404 } 1405 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1406 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1407 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1408 for (j=0; j<n; j++) { 1409 xb = x + 6*(*idx++); 1410 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1411 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1412 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1413 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1414 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1415 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1416 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1417 v += 36; 1418 } 1419 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1420 if (!usecprow) { 1421 z += 6; y += 6; 1422 } 1423 } 1424 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1425 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1426 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1427 PetscFunctionReturn(0); 1428 } 1429 1430 #undef __FUNCT__ 1431 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1432 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1433 { 1434 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1435 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1436 const PetscScalar *x,*xb; 1437 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1438 const MatScalar *v; 1439 PetscErrorCode ierr; 1440 PetscInt mbs = a->mbs,i,j,n; 1441 const PetscInt *idx,*ii,*ridx = NULL; 1442 PetscBool usecprow=a->compressedrow.use; 1443 1444 PetscFunctionBegin; 1445 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1446 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1447 1448 idx = a->j; 1449 v = a->a; 1450 if (usecprow) { 1451 if (zz != yy) { 1452 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1453 } 1454 mbs = a->compressedrow.nrows; 1455 ii = a->compressedrow.i; 1456 ridx = a->compressedrow.rindex; 1457 } else { 1458 ii = a->i; 1459 y = yarray; 1460 z = zarray; 1461 } 1462 1463 for (i=0; i<mbs; i++) { 1464 n = ii[1] - ii[0]; ii++; 1465 if (usecprow) { 1466 z = zarray + 7*ridx[i]; 1467 y = yarray + 7*ridx[i]; 1468 } 1469 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1470 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1471 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1472 for (j=0; j<n; j++) { 1473 xb = x + 7*(*idx++); 1474 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1475 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1476 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1477 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1478 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1479 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1480 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1481 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1482 v += 49; 1483 } 1484 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1485 if (!usecprow) { 1486 z += 7; y += 7; 1487 } 1488 } 1489 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1490 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1491 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1492 PetscFunctionReturn(0); 1493 } 1494 1495 #undef __FUNCT__ 1496 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1497 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1498 { 1499 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1500 PetscScalar *z = 0,*work,*workt,*zarray; 1501 const PetscScalar *x,*xb; 1502 const MatScalar *v; 1503 PetscErrorCode ierr; 1504 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1505 PetscInt ncols,k; 1506 const PetscInt *ridx = NULL,*idx,*ii; 1507 PetscBool usecprow = a->compressedrow.use; 1508 1509 PetscFunctionBegin; 1510 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1511 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1512 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1513 1514 idx = a->j; 1515 v = a->a; 1516 if (usecprow) { 1517 mbs = a->compressedrow.nrows; 1518 ii = a->compressedrow.i; 1519 ridx = a->compressedrow.rindex; 1520 } else { 1521 mbs = a->mbs; 1522 ii = a->i; 1523 z = zarray; 1524 } 1525 1526 if (!a->mult_work) { 1527 k = PetscMax(A->rmap->n,A->cmap->n); 1528 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1529 } 1530 work = a->mult_work; 1531 for (i=0; i<mbs; i++) { 1532 n = ii[1] - ii[0]; ii++; 1533 ncols = n*bs; 1534 workt = work; 1535 for (j=0; j<n; j++) { 1536 xb = x + bs*(*idx++); 1537 for (k=0; k<bs; k++) workt[k] = xb[k]; 1538 workt += bs; 1539 } 1540 if (usecprow) z = zarray + bs*ridx[i]; 1541 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1542 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1543 v += n*bs2; 1544 if (!usecprow) z += bs; 1545 } 1546 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1547 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1548 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 #undef __FUNCT__ 1553 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1554 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1555 { 1556 PetscScalar zero = 0.0; 1557 PetscErrorCode ierr; 1558 1559 PetscFunctionBegin; 1560 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1561 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1567 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1568 { 1569 PetscScalar zero = 0.0; 1570 PetscErrorCode ierr; 1571 1572 PetscFunctionBegin; 1573 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1574 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1575 PetscFunctionReturn(0); 1576 } 1577 1578 #undef __FUNCT__ 1579 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1580 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1581 { 1582 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1583 PetscScalar *z,x1,x2,x3,x4,x5; 1584 const PetscScalar *x,*xb = NULL; 1585 const MatScalar *v; 1586 PetscErrorCode ierr; 1587 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1588 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1589 Mat_CompressedRow cprow = a->compressedrow; 1590 PetscBool usecprow = cprow.use; 1591 1592 PetscFunctionBegin; 1593 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1594 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1595 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1596 1597 idx = a->j; 1598 v = a->a; 1599 if (usecprow) { 1600 mbs = cprow.nrows; 1601 ii = cprow.i; 1602 ridx = cprow.rindex; 1603 } else { 1604 mbs=a->mbs; 1605 ii = a->i; 1606 xb = x; 1607 } 1608 1609 switch (bs) { 1610 case 1: 1611 for (i=0; i<mbs; i++) { 1612 if (usecprow) xb = x + ridx[i]; 1613 x1 = xb[0]; 1614 ib = idx + ii[0]; 1615 n = ii[1] - ii[0]; ii++; 1616 for (j=0; j<n; j++) { 1617 rval = ib[j]; 1618 z[rval] += PetscConj(*v) * x1; 1619 v++; 1620 } 1621 if (!usecprow) xb++; 1622 } 1623 break; 1624 case 2: 1625 for (i=0; i<mbs; i++) { 1626 if (usecprow) xb = x + 2*ridx[i]; 1627 x1 = xb[0]; x2 = xb[1]; 1628 ib = idx + ii[0]; 1629 n = ii[1] - ii[0]; ii++; 1630 for (j=0; j<n; j++) { 1631 rval = ib[j]*2; 1632 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1633 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1634 v += 4; 1635 } 1636 if (!usecprow) xb += 2; 1637 } 1638 break; 1639 case 3: 1640 for (i=0; i<mbs; i++) { 1641 if (usecprow) xb = x + 3*ridx[i]; 1642 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1643 ib = idx + ii[0]; 1644 n = ii[1] - ii[0]; ii++; 1645 for (j=0; j<n; j++) { 1646 rval = ib[j]*3; 1647 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1648 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1649 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1650 v += 9; 1651 } 1652 if (!usecprow) xb += 3; 1653 } 1654 break; 1655 case 4: 1656 for (i=0; i<mbs; i++) { 1657 if (usecprow) xb = x + 4*ridx[i]; 1658 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1659 ib = idx + ii[0]; 1660 n = ii[1] - ii[0]; ii++; 1661 for (j=0; j<n; j++) { 1662 rval = ib[j]*4; 1663 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1664 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1665 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1666 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1667 v += 16; 1668 } 1669 if (!usecprow) xb += 4; 1670 } 1671 break; 1672 case 5: 1673 for (i=0; i<mbs; i++) { 1674 if (usecprow) xb = x + 5*ridx[i]; 1675 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1676 x4 = xb[3]; x5 = xb[4]; 1677 ib = idx + ii[0]; 1678 n = ii[1] - ii[0]; ii++; 1679 for (j=0; j<n; j++) { 1680 rval = ib[j]*5; 1681 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1682 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1683 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1684 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1685 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1686 v += 25; 1687 } 1688 if (!usecprow) xb += 5; 1689 } 1690 break; 1691 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1692 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1693 #if 0 1694 { 1695 PetscInt ncols,k,bs2=a->bs2; 1696 PetscScalar *work,*workt,zb; 1697 const PetscScalar *xtmp; 1698 if (!a->mult_work) { 1699 k = PetscMax(A->rmap->n,A->cmap->n); 1700 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1701 } 1702 work = a->mult_work; 1703 xtmp = x; 1704 for (i=0; i<mbs; i++) { 1705 n = ii[1] - ii[0]; ii++; 1706 ncols = n*bs; 1707 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1708 if (usecprow) xtmp = x + bs*ridx[i]; 1709 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1710 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1711 v += n*bs2; 1712 if (!usecprow) xtmp += bs; 1713 workt = work; 1714 for (j=0; j<n; j++) { 1715 zb = z + bs*(*idx++); 1716 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1717 workt += bs; 1718 } 1719 } 1720 } 1721 #endif 1722 } 1723 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1724 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1725 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1726 PetscFunctionReturn(0); 1727 } 1728 1729 #undef __FUNCT__ 1730 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1731 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1732 { 1733 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1734 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1735 const PetscScalar *x,*xb = 0; 1736 const MatScalar *v; 1737 PetscErrorCode ierr; 1738 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1739 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1740 Mat_CompressedRow cprow = a->compressedrow; 1741 PetscBool usecprow=cprow.use; 1742 1743 PetscFunctionBegin; 1744 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1745 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1746 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1747 1748 idx = a->j; 1749 v = a->a; 1750 if (usecprow) { 1751 mbs = cprow.nrows; 1752 ii = cprow.i; 1753 ridx = cprow.rindex; 1754 } else { 1755 mbs=a->mbs; 1756 ii = a->i; 1757 xb = x; 1758 } 1759 1760 switch (bs) { 1761 case 1: 1762 for (i=0; i<mbs; i++) { 1763 if (usecprow) xb = x + ridx[i]; 1764 x1 = xb[0]; 1765 ib = idx + ii[0]; 1766 n = ii[1] - ii[0]; ii++; 1767 for (j=0; j<n; j++) { 1768 rval = ib[j]; 1769 z[rval] += *v * x1; 1770 v++; 1771 } 1772 if (!usecprow) xb++; 1773 } 1774 break; 1775 case 2: 1776 for (i=0; i<mbs; i++) { 1777 if (usecprow) xb = x + 2*ridx[i]; 1778 x1 = xb[0]; x2 = xb[1]; 1779 ib = idx + ii[0]; 1780 n = ii[1] - ii[0]; ii++; 1781 for (j=0; j<n; j++) { 1782 rval = ib[j]*2; 1783 z[rval++] += v[0]*x1 + v[1]*x2; 1784 z[rval++] += v[2]*x1 + v[3]*x2; 1785 v += 4; 1786 } 1787 if (!usecprow) xb += 2; 1788 } 1789 break; 1790 case 3: 1791 for (i=0; i<mbs; i++) { 1792 if (usecprow) xb = x + 3*ridx[i]; 1793 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1794 ib = idx + ii[0]; 1795 n = ii[1] - ii[0]; ii++; 1796 for (j=0; j<n; j++) { 1797 rval = ib[j]*3; 1798 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1799 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1800 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1801 v += 9; 1802 } 1803 if (!usecprow) xb += 3; 1804 } 1805 break; 1806 case 4: 1807 for (i=0; i<mbs; i++) { 1808 if (usecprow) xb = x + 4*ridx[i]; 1809 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1810 ib = idx + ii[0]; 1811 n = ii[1] - ii[0]; ii++; 1812 for (j=0; j<n; j++) { 1813 rval = ib[j]*4; 1814 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1815 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1816 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1817 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1818 v += 16; 1819 } 1820 if (!usecprow) xb += 4; 1821 } 1822 break; 1823 case 5: 1824 for (i=0; i<mbs; i++) { 1825 if (usecprow) xb = x + 5*ridx[i]; 1826 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1827 x4 = xb[3]; x5 = xb[4]; 1828 ib = idx + ii[0]; 1829 n = ii[1] - ii[0]; ii++; 1830 for (j=0; j<n; j++) { 1831 rval = ib[j]*5; 1832 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1833 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1834 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1835 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1836 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1837 v += 25; 1838 } 1839 if (!usecprow) xb += 5; 1840 } 1841 break; 1842 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1843 PetscInt ncols,k; 1844 PetscScalar *work,*workt; 1845 const PetscScalar *xtmp; 1846 if (!a->mult_work) { 1847 k = PetscMax(A->rmap->n,A->cmap->n); 1848 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1849 } 1850 work = a->mult_work; 1851 xtmp = x; 1852 for (i=0; i<mbs; i++) { 1853 n = ii[1] - ii[0]; ii++; 1854 ncols = n*bs; 1855 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1856 if (usecprow) xtmp = x + bs*ridx[i]; 1857 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1858 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1859 v += n*bs2; 1860 if (!usecprow) xtmp += bs; 1861 workt = work; 1862 for (j=0; j<n; j++) { 1863 zb = z + bs*(*idx++); 1864 for (k=0; k<bs; k++) zb[k] += workt[k]; 1865 workt += bs; 1866 } 1867 } 1868 } 1869 } 1870 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1871 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1873 PetscFunctionReturn(0); 1874 } 1875 1876 #undef __FUNCT__ 1877 #define __FUNCT__ "MatScale_SeqBAIJ" 1878 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1879 { 1880 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1881 PetscInt totalnz = a->bs2*a->nz; 1882 PetscScalar oalpha = alpha; 1883 PetscErrorCode ierr; 1884 PetscBLASInt one = 1,tnz; 1885 1886 PetscFunctionBegin; 1887 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1888 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1889 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 #undef __FUNCT__ 1894 #define __FUNCT__ "MatNorm_SeqBAIJ" 1895 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1896 { 1897 PetscErrorCode ierr; 1898 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1899 MatScalar *v = a->a; 1900 PetscReal sum = 0.0; 1901 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1902 1903 PetscFunctionBegin; 1904 if (type == NORM_FROBENIUS) { 1905 for (i=0; i< bs2*nz; i++) { 1906 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1907 } 1908 *norm = PetscSqrtReal(sum); 1909 } else if (type == NORM_1) { /* maximum column sum */ 1910 PetscReal *tmp; 1911 PetscInt *bcol = a->j; 1912 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 1913 for (i=0; i<nz; i++) { 1914 for (j=0; j<bs; j++) { 1915 k1 = bs*(*bcol) + j; /* column index */ 1916 for (k=0; k<bs; k++) { 1917 tmp[k1] += PetscAbsScalar(*v); v++; 1918 } 1919 } 1920 bcol++; 1921 } 1922 *norm = 0.0; 1923 for (j=0; j<A->cmap->n; j++) { 1924 if (tmp[j] > *norm) *norm = tmp[j]; 1925 } 1926 ierr = PetscFree(tmp);CHKERRQ(ierr); 1927 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1928 *norm = 0.0; 1929 for (k=0; k<bs; k++) { 1930 for (j=0; j<a->mbs; j++) { 1931 v = a->a + bs2*a->i[j] + k; 1932 sum = 0.0; 1933 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1934 for (k1=0; k1<bs; k1++) { 1935 sum += PetscAbsScalar(*v); 1936 v += bs; 1937 } 1938 } 1939 if (sum > *norm) *norm = sum; 1940 } 1941 } 1942 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1943 PetscFunctionReturn(0); 1944 } 1945 1946 1947 #undef __FUNCT__ 1948 #define __FUNCT__ "MatEqual_SeqBAIJ" 1949 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1950 { 1951 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1952 PetscErrorCode ierr; 1953 1954 PetscFunctionBegin; 1955 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1956 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1957 *flg = PETSC_FALSE; 1958 PetscFunctionReturn(0); 1959 } 1960 1961 /* if the a->i are the same */ 1962 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1963 if (!*flg) PetscFunctionReturn(0); 1964 1965 /* if a->j are the same */ 1966 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1967 if (!*flg) PetscFunctionReturn(0); 1968 1969 /* if a->a are the same */ 1970 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1971 PetscFunctionReturn(0); 1972 1973 } 1974 1975 #undef __FUNCT__ 1976 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1977 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1978 { 1979 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1980 PetscErrorCode ierr; 1981 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1982 PetscScalar *x,zero = 0.0; 1983 MatScalar *aa,*aa_j; 1984 1985 PetscFunctionBegin; 1986 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1987 bs = A->rmap->bs; 1988 aa = a->a; 1989 ai = a->i; 1990 aj = a->j; 1991 ambs = a->mbs; 1992 bs2 = a->bs2; 1993 1994 ierr = VecSet(v,zero);CHKERRQ(ierr); 1995 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1996 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1997 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1998 for (i=0; i<ambs; i++) { 1999 for (j=ai[i]; j<ai[i+1]; j++) { 2000 if (aj[j] == i) { 2001 row = i*bs; 2002 aa_j = aa+j*bs2; 2003 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2004 break; 2005 } 2006 } 2007 } 2008 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2009 PetscFunctionReturn(0); 2010 } 2011 2012 #undef __FUNCT__ 2013 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2014 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2015 { 2016 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2017 const PetscScalar *l,*r,*li,*ri; 2018 PetscScalar x; 2019 MatScalar *aa, *v; 2020 PetscErrorCode ierr; 2021 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2022 const PetscInt *ai,*aj; 2023 2024 PetscFunctionBegin; 2025 ai = a->i; 2026 aj = a->j; 2027 aa = a->a; 2028 m = A->rmap->n; 2029 n = A->cmap->n; 2030 bs = A->rmap->bs; 2031 mbs = a->mbs; 2032 bs2 = a->bs2; 2033 if (ll) { 2034 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2035 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2036 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2037 for (i=0; i<mbs; i++) { /* for each block row */ 2038 M = ai[i+1] - ai[i]; 2039 li = l + i*bs; 2040 v = aa + bs2*ai[i]; 2041 for (j=0; j<M; j++) { /* for each block */ 2042 for (k=0; k<bs2; k++) { 2043 (*v++) *= li[k%bs]; 2044 } 2045 } 2046 } 2047 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2048 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2049 } 2050 2051 if (rr) { 2052 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2053 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2054 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2055 for (i=0; i<mbs; i++) { /* for each block row */ 2056 iai = ai[i]; 2057 M = ai[i+1] - iai; 2058 v = aa + bs2*iai; 2059 for (j=0; j<M; j++) { /* for each block */ 2060 ri = r + bs*aj[iai+j]; 2061 for (k=0; k<bs; k++) { 2062 x = ri[k]; 2063 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2064 v += bs; 2065 } 2066 } 2067 } 2068 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2069 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2070 } 2071 PetscFunctionReturn(0); 2072 } 2073 2074 2075 #undef __FUNCT__ 2076 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2077 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2078 { 2079 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2080 2081 PetscFunctionBegin; 2082 info->block_size = a->bs2; 2083 info->nz_allocated = a->bs2*a->maxnz; 2084 info->nz_used = a->bs2*a->nz; 2085 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2086 info->assemblies = A->num_ass; 2087 info->mallocs = A->info.mallocs; 2088 info->memory = ((PetscObject)A)->mem; 2089 if (A->factortype) { 2090 info->fill_ratio_given = A->info.fill_ratio_given; 2091 info->fill_ratio_needed = A->info.fill_ratio_needed; 2092 info->factor_mallocs = A->info.factor_mallocs; 2093 } else { 2094 info->fill_ratio_given = 0; 2095 info->fill_ratio_needed = 0; 2096 info->factor_mallocs = 0; 2097 } 2098 PetscFunctionReturn(0); 2099 } 2100 2101 #undef __FUNCT__ 2102 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2103 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2104 { 2105 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2106 PetscErrorCode ierr; 2107 2108 PetscFunctionBegin; 2109 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2110 PetscFunctionReturn(0); 2111 } 2112