1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n; ++j) { 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 64 } 65 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 66 } 67 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 68 ierr = PetscFree(nidx);CHKERRQ(ierr); 69 ierr = PetscFree(nidx2);CHKERRQ(ierr); 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag; 87 88 PetscFunctionBegin; 89 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 90 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 91 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 93 94 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 95 ssmap = smap; 96 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 97 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 98 /* determine lens of each row */ 99 for (i=0; i<nrows; i++) { 100 kstart = ai[irow[i]]; 101 kend = kstart + a->ilen[irow[i]]; 102 lens[i] = 0; 103 for (k=kstart; k<kend; k++) { 104 if (ssmap[aj[k]]) lens[i]++; 105 } 106 } 107 /* Create and fill new matrix */ 108 if (scall == MAT_REUSE_MATRIX) { 109 c = (Mat_SeqBAIJ*)((*B)->data); 110 111 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 112 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 113 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 114 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 115 C = *B; 116 } else { 117 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 118 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 119 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 120 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 121 } 122 c = (Mat_SeqBAIJ*)(C->data); 123 for (i=0; i<nrows; i++) { 124 row = irow[i]; 125 kstart = ai[row]; 126 kend = kstart + a->ilen[row]; 127 mat_i = c->i[i]; 128 mat_j = c->j + mat_i; 129 mat_a = c->a + mat_i*bs2; 130 mat_ilen = c->ilen + i; 131 for (k=kstart; k<kend; k++) { 132 if ((tcol=ssmap[a->j[k]])) { 133 *mat_j++ = tcol - 1; 134 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 135 mat_a += bs2; 136 (*mat_ilen)++; 137 } 138 } 139 } 140 /* sort */ 141 { 142 MatScalar *work; 143 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 144 for (i=0; i<nrows; i++) { 145 PetscInt ilen; 146 mat_i = c->i[i]; 147 mat_j = c->j + mat_i; 148 mat_a = c->a + mat_i*bs2; 149 ilen = c->ilen[i]; 150 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 151 } 152 ierr = PetscFree(work);CHKERRQ(ierr); 153 } 154 155 /* Free work space */ 156 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 157 ierr = PetscFree(smap);CHKERRQ(ierr); 158 ierr = PetscFree(lens);CHKERRQ(ierr); 159 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 160 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 162 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 163 *B = C; 164 PetscFunctionReturn(0); 165 } 166 167 #undef __FUNCT__ 168 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 169 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 170 { 171 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 172 IS is1,is2; 173 PetscErrorCode ierr; 174 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 175 const PetscInt *irow,*icol; 176 177 PetscFunctionBegin; 178 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 179 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 180 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 181 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 182 183 /* Verify if the indices corespond to each element in a block 184 and form the IS with compressed IS */ 185 maxmnbs = PetscMax(a->mbs,a->nbs); 186 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 187 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 188 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 189 for (i=0; i<a->mbs; i++) { 190 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 191 } 192 count = 0; 193 for (i=0; i<nrows; i++) { 194 j = irow[i] / bs; 195 if ((vary[j]--)==bs) iary[count++] = j; 196 } 197 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 198 199 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 200 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 201 for (i=0; i<a->nbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 203 } 204 count = 0; 205 for (i=0; i<ncols; i++) { 206 j = icol[i] / bs; 207 if ((vary[j]--)==bs) iary[count++] = j; 208 } 209 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 212 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 213 214 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 215 ierr = ISDestroy(&is1);CHKERRQ(ierr); 216 ierr = ISDestroy(&is2);CHKERRQ(ierr); 217 PetscFunctionReturn(0); 218 } 219 220 #undef __FUNCT__ 221 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 222 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 223 { 224 PetscErrorCode ierr; 225 PetscInt i; 226 227 PetscFunctionBegin; 228 if (scall == MAT_INITIAL_MATRIX) { 229 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 230 } 231 232 for (i=0; i<n; i++) { 233 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 234 } 235 PetscFunctionReturn(0); 236 } 237 238 239 /* -------------------------------------------------------*/ 240 /* Should check that shapes of vectors and matrices match */ 241 /* -------------------------------------------------------*/ 242 243 #undef __FUNCT__ 244 #define __FUNCT__ "MatMult_SeqBAIJ_1" 245 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 246 { 247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 248 PetscScalar *z,sum; 249 const PetscScalar *x; 250 const MatScalar *v; 251 PetscErrorCode ierr; 252 PetscInt mbs,i,n; 253 const PetscInt *idx,*ii,*ridx=NULL; 254 PetscBool usecprow=a->compressedrow.use; 255 256 PetscFunctionBegin; 257 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 258 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 259 260 if (usecprow) { 261 mbs = a->compressedrow.nrows; 262 ii = a->compressedrow.i; 263 ridx = a->compressedrow.rindex; 264 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 265 } else { 266 mbs = a->mbs; 267 ii = a->i; 268 } 269 270 for (i=0; i<mbs; i++) { 271 n = ii[1] - ii[0]; 272 v = a->a + ii[0]; 273 idx = a->j + ii[0]; 274 ii++; 275 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 276 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 277 sum = 0.0; 278 PetscSparseDensePlusDot(sum,x,v,idx,n); 279 if (usecprow) { 280 z[ridx[i]] = sum; 281 } else { 282 z[i] = sum; 283 } 284 } 285 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 286 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 287 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 288 PetscFunctionReturn(0); 289 } 290 291 #undef __FUNCT__ 292 #define __FUNCT__ "MatMult_SeqBAIJ_2" 293 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 294 { 295 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 296 PetscScalar *z = 0,sum1,sum2,*zarray; 297 const PetscScalar *x,*xb; 298 PetscScalar x1,x2; 299 const MatScalar *v; 300 PetscErrorCode ierr; 301 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 302 PetscBool usecprow=a->compressedrow.use; 303 304 PetscFunctionBegin; 305 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 306 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 307 308 idx = a->j; 309 v = a->a; 310 if (usecprow) { 311 mbs = a->compressedrow.nrows; 312 ii = a->compressedrow.i; 313 ridx = a->compressedrow.rindex; 314 } else { 315 mbs = a->mbs; 316 ii = a->i; 317 z = zarray; 318 } 319 320 for (i=0; i<mbs; i++) { 321 n = ii[1] - ii[0]; ii++; 322 sum1 = 0.0; sum2 = 0.0; 323 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 324 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 325 for (j=0; j<n; j++) { 326 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 327 sum1 += v[0]*x1 + v[2]*x2; 328 sum2 += v[1]*x1 + v[3]*x2; 329 v += 4; 330 } 331 if (usecprow) z = zarray + 2*ridx[i]; 332 z[0] = sum1; z[1] = sum2; 333 if (!usecprow) z += 2; 334 } 335 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 336 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 337 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 338 PetscFunctionReturn(0); 339 } 340 341 #undef __FUNCT__ 342 #define __FUNCT__ "MatMult_SeqBAIJ_3" 343 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 344 { 345 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 346 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 347 const PetscScalar *x,*xb; 348 const MatScalar *v; 349 PetscErrorCode ierr; 350 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 351 PetscBool usecprow=a->compressedrow.use; 352 353 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 354 #pragma disjoint(*v,*z,*xb) 355 #endif 356 357 PetscFunctionBegin; 358 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 359 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 360 361 idx = a->j; 362 v = a->a; 363 if (usecprow) { 364 mbs = a->compressedrow.nrows; 365 ii = a->compressedrow.i; 366 ridx = a->compressedrow.rindex; 367 } else { 368 mbs = a->mbs; 369 ii = a->i; 370 z = zarray; 371 } 372 373 for (i=0; i<mbs; i++) { 374 n = ii[1] - ii[0]; ii++; 375 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 376 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 377 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 378 for (j=0; j<n; j++) { 379 xb = x + 3*(*idx++); 380 x1 = xb[0]; 381 x2 = xb[1]; 382 x3 = xb[2]; 383 384 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 385 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 386 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 387 v += 9; 388 } 389 if (usecprow) z = zarray + 3*ridx[i]; 390 z[0] = sum1; z[1] = sum2; z[2] = sum3; 391 if (!usecprow) z += 3; 392 } 393 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 394 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 395 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 396 PetscFunctionReturn(0); 397 } 398 399 #undef __FUNCT__ 400 #define __FUNCT__ "MatMult_SeqBAIJ_4" 401 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 402 { 403 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 404 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 405 const PetscScalar *x,*xb; 406 const MatScalar *v; 407 PetscErrorCode ierr; 408 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 409 PetscBool usecprow=a->compressedrow.use; 410 411 PetscFunctionBegin; 412 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 413 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 414 415 idx = a->j; 416 v = a->a; 417 if (usecprow) { 418 mbs = a->compressedrow.nrows; 419 ii = a->compressedrow.i; 420 ridx = a->compressedrow.rindex; 421 } else { 422 mbs = a->mbs; 423 ii = a->i; 424 z = zarray; 425 } 426 427 for (i=0; i<mbs; i++) { 428 n = ii[1] - ii[0]; 429 ii++; 430 sum1 = 0.0; 431 sum2 = 0.0; 432 sum3 = 0.0; 433 sum4 = 0.0; 434 435 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 436 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 437 for (j=0; j<n; j++) { 438 xb = x + 4*(*idx++); 439 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 440 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 441 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 442 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 443 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 444 v += 16; 445 } 446 if (usecprow) z = zarray + 4*ridx[i]; 447 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 448 if (!usecprow) z += 4; 449 } 450 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 451 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 452 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 453 PetscFunctionReturn(0); 454 } 455 456 #undef __FUNCT__ 457 #define __FUNCT__ "MatMult_SeqBAIJ_5" 458 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 459 { 460 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 461 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 462 const PetscScalar *xb,*x; 463 const MatScalar *v; 464 PetscErrorCode ierr; 465 const PetscInt *idx,*ii,*ridx=NULL; 466 PetscInt mbs,i,j,n; 467 PetscBool usecprow=a->compressedrow.use; 468 469 PetscFunctionBegin; 470 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 471 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 472 473 idx = a->j; 474 v = a->a; 475 if (usecprow) { 476 mbs = a->compressedrow.nrows; 477 ii = a->compressedrow.i; 478 ridx = a->compressedrow.rindex; 479 } else { 480 mbs = a->mbs; 481 ii = a->i; 482 z = zarray; 483 } 484 485 for (i=0; i<mbs; i++) { 486 n = ii[1] - ii[0]; ii++; 487 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 488 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 489 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 490 for (j=0; j<n; j++) { 491 xb = x + 5*(*idx++); 492 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 493 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 494 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 495 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 496 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 497 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 498 v += 25; 499 } 500 if (usecprow) z = zarray + 5*ridx[i]; 501 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 502 if (!usecprow) z += 5; 503 } 504 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 505 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 506 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 507 PetscFunctionReturn(0); 508 } 509 510 511 512 #undef __FUNCT__ 513 #define __FUNCT__ "MatMult_SeqBAIJ_6" 514 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 515 { 516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 517 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 518 const PetscScalar *x,*xb; 519 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 520 const MatScalar *v; 521 PetscErrorCode ierr; 522 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 523 PetscBool usecprow=a->compressedrow.use; 524 525 PetscFunctionBegin; 526 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 527 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 528 529 idx = a->j; 530 v = a->a; 531 if (usecprow) { 532 mbs = a->compressedrow.nrows; 533 ii = a->compressedrow.i; 534 ridx = a->compressedrow.rindex; 535 } else { 536 mbs = a->mbs; 537 ii = a->i; 538 z = zarray; 539 } 540 541 for (i=0; i<mbs; i++) { 542 n = ii[1] - ii[0]; 543 ii++; 544 sum1 = 0.0; 545 sum2 = 0.0; 546 sum3 = 0.0; 547 sum4 = 0.0; 548 sum5 = 0.0; 549 sum6 = 0.0; 550 551 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 552 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 553 for (j=0; j<n; j++) { 554 xb = x + 6*(*idx++); 555 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 556 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 557 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 558 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 559 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 560 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 561 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 562 v += 36; 563 } 564 if (usecprow) z = zarray + 6*ridx[i]; 565 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 566 if (!usecprow) z += 6; 567 } 568 569 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 570 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 571 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 572 PetscFunctionReturn(0); 573 } 574 575 #undef __FUNCT__ 576 #define __FUNCT__ "MatMult_SeqBAIJ_7" 577 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 578 { 579 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 580 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 581 const PetscScalar *x,*xb; 582 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 583 const MatScalar *v; 584 PetscErrorCode ierr; 585 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 586 PetscBool usecprow=a->compressedrow.use; 587 588 PetscFunctionBegin; 589 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 590 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 591 592 idx = a->j; 593 v = a->a; 594 if (usecprow) { 595 mbs = a->compressedrow.nrows; 596 ii = a->compressedrow.i; 597 ridx = a->compressedrow.rindex; 598 } else { 599 mbs = a->mbs; 600 ii = a->i; 601 z = zarray; 602 } 603 604 for (i=0; i<mbs; i++) { 605 n = ii[1] - ii[0]; 606 ii++; 607 sum1 = 0.0; 608 sum2 = 0.0; 609 sum3 = 0.0; 610 sum4 = 0.0; 611 sum5 = 0.0; 612 sum6 = 0.0; 613 sum7 = 0.0; 614 615 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 616 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 617 for (j=0; j<n; j++) { 618 xb = x + 7*(*idx++); 619 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 620 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 621 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 622 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 623 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 624 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 625 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 626 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 627 v += 49; 628 } 629 if (usecprow) z = zarray + 7*ridx[i]; 630 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 631 if (!usecprow) z += 7; 632 } 633 634 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 635 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 636 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 637 PetscFunctionReturn(0); 638 } 639 640 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 641 /* Default MatMult for block size 15 */ 642 643 #undef __FUNCT__ 644 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 645 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 646 { 647 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 648 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 649 const PetscScalar *x,*xb; 650 PetscScalar *zarray,xv; 651 const MatScalar *v; 652 PetscErrorCode ierr; 653 const PetscInt *ii,*ij=a->j,*idx; 654 PetscInt mbs,i,j,k,n,*ridx=NULL; 655 PetscBool usecprow=a->compressedrow.use; 656 657 PetscFunctionBegin; 658 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 659 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 660 661 v = a->a; 662 if (usecprow) { 663 mbs = a->compressedrow.nrows; 664 ii = a->compressedrow.i; 665 ridx = a->compressedrow.rindex; 666 } else { 667 mbs = a->mbs; 668 ii = a->i; 669 z = zarray; 670 } 671 672 for (i=0; i<mbs; i++) { 673 n = ii[i+1] - ii[i]; 674 idx = ij + ii[i]; 675 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 676 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 677 678 for (j=0; j<n; j++) { 679 xb = x + 15*(idx[j]); 680 681 for (k=0; k<15; k++) { 682 xv = xb[k]; 683 sum1 += v[0]*xv; 684 sum2 += v[1]*xv; 685 sum3 += v[2]*xv; 686 sum4 += v[3]*xv; 687 sum5 += v[4]*xv; 688 sum6 += v[5]*xv; 689 sum7 += v[6]*xv; 690 sum8 += v[7]*xv; 691 sum9 += v[8]*xv; 692 sum10 += v[9]*xv; 693 sum11 += v[10]*xv; 694 sum12 += v[11]*xv; 695 sum13 += v[12]*xv; 696 sum14 += v[13]*xv; 697 sum15 += v[14]*xv; 698 v += 15; 699 } 700 } 701 if (usecprow) z = zarray + 15*ridx[i]; 702 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 703 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 704 705 if (!usecprow) z += 15; 706 } 707 708 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 709 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 710 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 711 PetscFunctionReturn(0); 712 } 713 714 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 717 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 718 { 719 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 720 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 721 const PetscScalar *x,*xb; 722 PetscScalar x1,x2,x3,x4,*zarray; 723 const MatScalar *v; 724 PetscErrorCode ierr; 725 const PetscInt *ii,*ij=a->j,*idx; 726 PetscInt mbs,i,j,n,*ridx=NULL; 727 PetscBool usecprow=a->compressedrow.use; 728 729 PetscFunctionBegin; 730 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 731 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 732 733 v = a->a; 734 if (usecprow) { 735 mbs = a->compressedrow.nrows; 736 ii = a->compressedrow.i; 737 ridx = a->compressedrow.rindex; 738 } else { 739 mbs = a->mbs; 740 ii = a->i; 741 z = zarray; 742 } 743 744 for (i=0; i<mbs; i++) { 745 n = ii[i+1] - ii[i]; 746 idx = ij + ii[i]; 747 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 748 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 749 750 for (j=0; j<n; j++) { 751 xb = x + 15*(idx[j]); 752 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 753 754 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 755 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 756 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 757 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 758 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 759 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 760 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 761 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 762 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 763 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 764 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 765 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 766 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 767 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 768 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 769 770 v += 60; 771 772 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 773 774 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 775 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 776 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 777 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 778 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 779 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 780 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 781 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 782 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 783 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 784 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 785 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 786 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 787 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 788 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 789 v += 60; 790 791 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 792 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 793 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 794 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 795 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 796 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 797 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 798 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 799 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 800 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 801 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 802 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 803 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 804 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 805 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 806 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 807 v += 60; 808 809 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 810 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 811 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 812 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 813 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 814 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 815 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 816 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 817 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 818 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 819 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 820 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 821 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 822 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 823 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 824 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 825 v += 45; 826 } 827 if (usecprow) z = zarray + 15*ridx[i]; 828 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 829 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 830 831 if (!usecprow) z += 15; 832 } 833 834 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 835 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 836 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 837 PetscFunctionReturn(0); 838 } 839 840 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 841 #undef __FUNCT__ 842 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 843 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 844 { 845 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 846 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 847 const PetscScalar *x,*xb; 848 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 849 const MatScalar *v; 850 PetscErrorCode ierr; 851 const PetscInt *ii,*ij=a->j,*idx; 852 PetscInt mbs,i,j,n,*ridx=NULL; 853 PetscBool usecprow=a->compressedrow.use; 854 855 PetscFunctionBegin; 856 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 857 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 858 859 v = a->a; 860 if (usecprow) { 861 mbs = a->compressedrow.nrows; 862 ii = a->compressedrow.i; 863 ridx = a->compressedrow.rindex; 864 } else { 865 mbs = a->mbs; 866 ii = a->i; 867 z = zarray; 868 } 869 870 for (i=0; i<mbs; i++) { 871 n = ii[i+1] - ii[i]; 872 idx = ij + ii[i]; 873 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 874 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 875 876 for (j=0; j<n; j++) { 877 xb = x + 15*(idx[j]); 878 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 879 x8 = xb[7]; 880 881 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 882 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 883 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 884 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 885 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 886 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 887 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 888 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 889 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 890 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 891 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 892 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 893 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 894 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 895 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 896 v += 120; 897 898 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 899 900 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 901 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 902 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 903 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 904 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 905 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 906 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 907 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 908 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 909 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 910 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 911 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 912 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 913 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 914 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 915 v += 105; 916 } 917 if (usecprow) z = zarray + 15*ridx[i]; 918 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 919 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 920 921 if (!usecprow) z += 15; 922 } 923 924 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 925 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 926 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 927 PetscFunctionReturn(0); 928 } 929 930 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 931 932 #undef __FUNCT__ 933 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 934 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 935 { 936 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 937 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 938 const PetscScalar *x,*xb; 939 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 940 const MatScalar *v; 941 PetscErrorCode ierr; 942 const PetscInt *ii,*ij=a->j,*idx; 943 PetscInt mbs,i,j,n,*ridx=NULL; 944 PetscBool usecprow=a->compressedrow.use; 945 946 PetscFunctionBegin; 947 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 948 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 949 950 v = a->a; 951 if (usecprow) { 952 mbs = a->compressedrow.nrows; 953 ii = a->compressedrow.i; 954 ridx = a->compressedrow.rindex; 955 } else { 956 mbs = a->mbs; 957 ii = a->i; 958 z = zarray; 959 } 960 961 for (i=0; i<mbs; i++) { 962 n = ii[i+1] - ii[i]; 963 idx = ij + ii[i]; 964 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 965 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 966 967 for (j=0; j<n; j++) { 968 xb = x + 15*(idx[j]); 969 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 970 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 971 972 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 973 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 974 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 975 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 976 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 977 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 978 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 979 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 980 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 981 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 982 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 983 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 984 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 985 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 986 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 987 v += 225; 988 } 989 if (usecprow) z = zarray + 15*ridx[i]; 990 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 991 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 992 993 if (!usecprow) z += 15; 994 } 995 996 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 997 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 998 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 999 PetscFunctionReturn(0); 1000 } 1001 1002 1003 /* 1004 This will not work with MatScalar == float because it calls the BLAS 1005 */ 1006 #undef __FUNCT__ 1007 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1008 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1009 { 1010 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1011 PetscScalar *z = 0,*work,*workt,*zarray; 1012 const PetscScalar *x,*xb; 1013 const MatScalar *v; 1014 PetscErrorCode ierr; 1015 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1016 const PetscInt *idx,*ii,*ridx=NULL; 1017 PetscInt ncols,k; 1018 PetscBool usecprow=a->compressedrow.use; 1019 1020 PetscFunctionBegin; 1021 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1022 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1023 1024 idx = a->j; 1025 v = a->a; 1026 if (usecprow) { 1027 mbs = a->compressedrow.nrows; 1028 ii = a->compressedrow.i; 1029 ridx = a->compressedrow.rindex; 1030 } else { 1031 mbs = a->mbs; 1032 ii = a->i; 1033 z = zarray; 1034 } 1035 1036 if (!a->mult_work) { 1037 k = PetscMax(A->rmap->n,A->cmap->n); 1038 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1039 } 1040 work = a->mult_work; 1041 for (i=0; i<mbs; i++) { 1042 n = ii[1] - ii[0]; ii++; 1043 ncols = n*bs; 1044 workt = work; 1045 for (j=0; j<n; j++) { 1046 xb = x + bs*(*idx++); 1047 for (k=0; k<bs; k++) workt[k] = xb[k]; 1048 workt += bs; 1049 } 1050 if (usecprow) z = zarray + bs*ridx[i]; 1051 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1052 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1053 v += n*bs2; 1054 if (!usecprow) z += bs; 1055 } 1056 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1057 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1058 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1059 PetscFunctionReturn(0); 1060 } 1061 1062 #undef __FUNCT__ 1063 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1064 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1065 { 1066 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1067 const PetscScalar *x; 1068 PetscScalar *y,*z,sum; 1069 const MatScalar *v; 1070 PetscErrorCode ierr; 1071 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1072 const PetscInt *idx,*ii; 1073 PetscBool usecprow=a->compressedrow.use; 1074 1075 PetscFunctionBegin; 1076 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1077 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1078 1079 idx = a->j; 1080 v = a->a; 1081 if (usecprow) { 1082 if (zz != yy) { 1083 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1084 } 1085 mbs = a->compressedrow.nrows; 1086 ii = a->compressedrow.i; 1087 ridx = a->compressedrow.rindex; 1088 } else { 1089 ii = a->i; 1090 } 1091 1092 for (i=0; i<mbs; i++) { 1093 n = ii[1] - ii[0]; 1094 ii++; 1095 if (!usecprow) { 1096 sum = y[i]; 1097 } else { 1098 sum = y[ridx[i]]; 1099 } 1100 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1101 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1102 PetscSparseDensePlusDot(sum,x,v,idx,n); 1103 v += n; 1104 idx += n; 1105 if (usecprow) { 1106 z[ridx[i]] = sum; 1107 } else { 1108 z[i] = sum; 1109 } 1110 } 1111 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1112 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1113 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1114 PetscFunctionReturn(0); 1115 } 1116 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1119 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1120 { 1121 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1122 PetscScalar *y = 0,*z = 0,sum1,sum2; 1123 const PetscScalar *x,*xb; 1124 PetscScalar x1,x2,*yarray,*zarray; 1125 const MatScalar *v; 1126 PetscErrorCode ierr; 1127 PetscInt mbs = a->mbs,i,n,j; 1128 const PetscInt *idx,*ii,*ridx = NULL; 1129 PetscBool usecprow = a->compressedrow.use; 1130 1131 PetscFunctionBegin; 1132 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1133 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1134 1135 idx = a->j; 1136 v = a->a; 1137 if (usecprow) { 1138 if (zz != yy) { 1139 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1140 } 1141 mbs = a->compressedrow.nrows; 1142 ii = a->compressedrow.i; 1143 ridx = a->compressedrow.rindex; 1144 if (zz != yy) { 1145 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1146 } 1147 } else { 1148 ii = a->i; 1149 y = yarray; 1150 z = zarray; 1151 } 1152 1153 for (i=0; i<mbs; i++) { 1154 n = ii[1] - ii[0]; ii++; 1155 if (usecprow) { 1156 z = zarray + 2*ridx[i]; 1157 y = yarray + 2*ridx[i]; 1158 } 1159 sum1 = y[0]; sum2 = y[1]; 1160 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1161 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1162 for (j=0; j<n; j++) { 1163 xb = x + 2*(*idx++); 1164 x1 = xb[0]; 1165 x2 = xb[1]; 1166 1167 sum1 += v[0]*x1 + v[2]*x2; 1168 sum2 += v[1]*x1 + v[3]*x2; 1169 v += 4; 1170 } 1171 z[0] = sum1; z[1] = sum2; 1172 if (!usecprow) { 1173 z += 2; y += 2; 1174 } 1175 } 1176 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1177 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1178 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1179 PetscFunctionReturn(0); 1180 } 1181 1182 #undef __FUNCT__ 1183 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1184 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1185 { 1186 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1187 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1188 const PetscScalar *x,*xb; 1189 const MatScalar *v; 1190 PetscErrorCode ierr; 1191 PetscInt mbs = a->mbs,i,j,n; 1192 const PetscInt *idx,*ii,*ridx = NULL; 1193 PetscBool usecprow = a->compressedrow.use; 1194 1195 PetscFunctionBegin; 1196 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1197 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1198 1199 idx = a->j; 1200 v = a->a; 1201 if (usecprow) { 1202 if (zz != yy) { 1203 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1204 } 1205 mbs = a->compressedrow.nrows; 1206 ii = a->compressedrow.i; 1207 ridx = a->compressedrow.rindex; 1208 } else { 1209 ii = a->i; 1210 y = yarray; 1211 z = zarray; 1212 } 1213 1214 for (i=0; i<mbs; i++) { 1215 n = ii[1] - ii[0]; ii++; 1216 if (usecprow) { 1217 z = zarray + 3*ridx[i]; 1218 y = yarray + 3*ridx[i]; 1219 } 1220 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1221 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1222 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1223 for (j=0; j<n; j++) { 1224 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1225 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1226 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1227 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1228 v += 9; 1229 } 1230 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1231 if (!usecprow) { 1232 z += 3; y += 3; 1233 } 1234 } 1235 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1236 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1237 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1238 PetscFunctionReturn(0); 1239 } 1240 1241 #undef __FUNCT__ 1242 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1243 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1244 { 1245 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1246 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1247 const PetscScalar *x,*xb; 1248 const MatScalar *v; 1249 PetscErrorCode ierr; 1250 PetscInt mbs = a->mbs,i,j,n; 1251 const PetscInt *idx,*ii,*ridx=NULL; 1252 PetscBool usecprow=a->compressedrow.use; 1253 1254 PetscFunctionBegin; 1255 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1256 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1257 1258 idx = a->j; 1259 v = a->a; 1260 if (usecprow) { 1261 if (zz != yy) { 1262 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1263 } 1264 mbs = a->compressedrow.nrows; 1265 ii = a->compressedrow.i; 1266 ridx = a->compressedrow.rindex; 1267 } else { 1268 ii = a->i; 1269 y = yarray; 1270 z = zarray; 1271 } 1272 1273 for (i=0; i<mbs; i++) { 1274 n = ii[1] - ii[0]; ii++; 1275 if (usecprow) { 1276 z = zarray + 4*ridx[i]; 1277 y = yarray + 4*ridx[i]; 1278 } 1279 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1280 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1281 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1282 for (j=0; j<n; j++) { 1283 xb = x + 4*(*idx++); 1284 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1285 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1286 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1287 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1288 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1289 v += 16; 1290 } 1291 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1292 if (!usecprow) { 1293 z += 4; y += 4; 1294 } 1295 } 1296 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1297 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1298 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1299 PetscFunctionReturn(0); 1300 } 1301 1302 #undef __FUNCT__ 1303 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1304 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1305 { 1306 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1307 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1308 const PetscScalar *x,*xb; 1309 PetscScalar *yarray,*zarray; 1310 const MatScalar *v; 1311 PetscErrorCode ierr; 1312 PetscInt mbs = a->mbs,i,j,n; 1313 const PetscInt *idx,*ii,*ridx = NULL; 1314 PetscBool usecprow=a->compressedrow.use; 1315 1316 PetscFunctionBegin; 1317 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1318 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1319 1320 idx = a->j; 1321 v = a->a; 1322 if (usecprow) { 1323 if (zz != yy) { 1324 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1325 } 1326 mbs = a->compressedrow.nrows; 1327 ii = a->compressedrow.i; 1328 ridx = a->compressedrow.rindex; 1329 } else { 1330 ii = a->i; 1331 y = yarray; 1332 z = zarray; 1333 } 1334 1335 for (i=0; i<mbs; i++) { 1336 n = ii[1] - ii[0]; ii++; 1337 if (usecprow) { 1338 z = zarray + 5*ridx[i]; 1339 y = yarray + 5*ridx[i]; 1340 } 1341 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1342 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1343 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1344 for (j=0; j<n; j++) { 1345 xb = x + 5*(*idx++); 1346 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1347 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1348 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1349 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1350 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1351 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1352 v += 25; 1353 } 1354 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1355 if (!usecprow) { 1356 z += 5; y += 5; 1357 } 1358 } 1359 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1360 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1361 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1362 PetscFunctionReturn(0); 1363 } 1364 #undef __FUNCT__ 1365 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1366 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1367 { 1368 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1369 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1370 const PetscScalar *x,*xb; 1371 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1372 const MatScalar *v; 1373 PetscErrorCode ierr; 1374 PetscInt mbs = a->mbs,i,j,n; 1375 const PetscInt *idx,*ii,*ridx=NULL; 1376 PetscBool usecprow=a->compressedrow.use; 1377 1378 PetscFunctionBegin; 1379 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1380 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1381 1382 idx = a->j; 1383 v = a->a; 1384 if (usecprow) { 1385 if (zz != yy) { 1386 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1387 } 1388 mbs = a->compressedrow.nrows; 1389 ii = a->compressedrow.i; 1390 ridx = a->compressedrow.rindex; 1391 } else { 1392 ii = a->i; 1393 y = yarray; 1394 z = zarray; 1395 } 1396 1397 for (i=0; i<mbs; i++) { 1398 n = ii[1] - ii[0]; ii++; 1399 if (usecprow) { 1400 z = zarray + 6*ridx[i]; 1401 y = yarray + 6*ridx[i]; 1402 } 1403 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1404 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1405 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1406 for (j=0; j<n; j++) { 1407 xb = x + 6*(*idx++); 1408 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1409 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1418 if (!usecprow) { 1419 z += 6; y += 6; 1420 } 1421 } 1422 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1423 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1424 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1425 PetscFunctionReturn(0); 1426 } 1427 1428 #undef __FUNCT__ 1429 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1430 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1431 { 1432 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1433 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1434 const PetscScalar *x,*xb; 1435 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1436 const MatScalar *v; 1437 PetscErrorCode ierr; 1438 PetscInt mbs = a->mbs,i,j,n; 1439 const PetscInt *idx,*ii,*ridx = NULL; 1440 PetscBool usecprow=a->compressedrow.use; 1441 1442 PetscFunctionBegin; 1443 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1444 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1445 1446 idx = a->j; 1447 v = a->a; 1448 if (usecprow) { 1449 if (zz != yy) { 1450 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1451 } 1452 mbs = a->compressedrow.nrows; 1453 ii = a->compressedrow.i; 1454 ridx = a->compressedrow.rindex; 1455 } else { 1456 ii = a->i; 1457 y = yarray; 1458 z = zarray; 1459 } 1460 1461 for (i=0; i<mbs; i++) { 1462 n = ii[1] - ii[0]; ii++; 1463 if (usecprow) { 1464 z = zarray + 7*ridx[i]; 1465 y = yarray + 7*ridx[i]; 1466 } 1467 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1468 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1469 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1470 for (j=0; j<n; j++) { 1471 xb = x + 7*(*idx++); 1472 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1473 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1474 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1475 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1476 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1477 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1478 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1479 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1480 v += 49; 1481 } 1482 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1483 if (!usecprow) { 1484 z += 7; y += 7; 1485 } 1486 } 1487 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1488 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1489 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1490 PetscFunctionReturn(0); 1491 } 1492 1493 #undef __FUNCT__ 1494 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1495 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1496 { 1497 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1498 PetscScalar *z = 0,*work,*workt,*zarray; 1499 const PetscScalar *x,*xb; 1500 const MatScalar *v; 1501 PetscErrorCode ierr; 1502 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1503 PetscInt ncols,k; 1504 const PetscInt *ridx = NULL,*idx,*ii; 1505 PetscBool usecprow = a->compressedrow.use; 1506 1507 PetscFunctionBegin; 1508 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1510 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1511 1512 idx = a->j; 1513 v = a->a; 1514 if (usecprow) { 1515 mbs = a->compressedrow.nrows; 1516 ii = a->compressedrow.i; 1517 ridx = a->compressedrow.rindex; 1518 } else { 1519 mbs = a->mbs; 1520 ii = a->i; 1521 z = zarray; 1522 } 1523 1524 if (!a->mult_work) { 1525 k = PetscMax(A->rmap->n,A->cmap->n); 1526 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1527 } 1528 work = a->mult_work; 1529 for (i=0; i<mbs; i++) { 1530 n = ii[1] - ii[0]; ii++; 1531 ncols = n*bs; 1532 workt = work; 1533 for (j=0; j<n; j++) { 1534 xb = x + bs*(*idx++); 1535 for (k=0; k<bs; k++) workt[k] = xb[k]; 1536 workt += bs; 1537 } 1538 if (usecprow) z = zarray + bs*ridx[i]; 1539 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1540 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1541 v += n*bs2; 1542 if (!usecprow) z += bs; 1543 } 1544 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1545 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1546 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1547 PetscFunctionReturn(0); 1548 } 1549 1550 #undef __FUNCT__ 1551 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1552 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1553 { 1554 PetscScalar zero = 0.0; 1555 PetscErrorCode ierr; 1556 1557 PetscFunctionBegin; 1558 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1559 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1560 PetscFunctionReturn(0); 1561 } 1562 1563 #undef __FUNCT__ 1564 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1565 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1566 { 1567 PetscScalar zero = 0.0; 1568 PetscErrorCode ierr; 1569 1570 PetscFunctionBegin; 1571 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1572 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1573 PetscFunctionReturn(0); 1574 } 1575 1576 #undef __FUNCT__ 1577 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1578 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1579 { 1580 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1581 PetscScalar *z,x1,x2,x3,x4,x5; 1582 const PetscScalar *x,*xb = NULL; 1583 const MatScalar *v; 1584 PetscErrorCode ierr; 1585 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1586 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1587 Mat_CompressedRow cprow = a->compressedrow; 1588 PetscBool usecprow = cprow.use; 1589 1590 PetscFunctionBegin; 1591 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1592 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1593 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1594 1595 idx = a->j; 1596 v = a->a; 1597 if (usecprow) { 1598 mbs = cprow.nrows; 1599 ii = cprow.i; 1600 ridx = cprow.rindex; 1601 } else { 1602 mbs=a->mbs; 1603 ii = a->i; 1604 xb = x; 1605 } 1606 1607 switch (bs) { 1608 case 1: 1609 for (i=0; i<mbs; i++) { 1610 if (usecprow) xb = x + ridx[i]; 1611 x1 = xb[0]; 1612 ib = idx + ii[0]; 1613 n = ii[1] - ii[0]; ii++; 1614 for (j=0; j<n; j++) { 1615 rval = ib[j]; 1616 z[rval] += PetscConj(*v) * x1; 1617 v++; 1618 } 1619 if (!usecprow) xb++; 1620 } 1621 break; 1622 case 2: 1623 for (i=0; i<mbs; i++) { 1624 if (usecprow) xb = x + 2*ridx[i]; 1625 x1 = xb[0]; x2 = xb[1]; 1626 ib = idx + ii[0]; 1627 n = ii[1] - ii[0]; ii++; 1628 for (j=0; j<n; j++) { 1629 rval = ib[j]*2; 1630 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1631 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1632 v += 4; 1633 } 1634 if (!usecprow) xb += 2; 1635 } 1636 break; 1637 case 3: 1638 for (i=0; i<mbs; i++) { 1639 if (usecprow) xb = x + 3*ridx[i]; 1640 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1641 ib = idx + ii[0]; 1642 n = ii[1] - ii[0]; ii++; 1643 for (j=0; j<n; j++) { 1644 rval = ib[j]*3; 1645 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1646 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1647 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1648 v += 9; 1649 } 1650 if (!usecprow) xb += 3; 1651 } 1652 break; 1653 case 4: 1654 for (i=0; i<mbs; i++) { 1655 if (usecprow) xb = x + 4*ridx[i]; 1656 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1657 ib = idx + ii[0]; 1658 n = ii[1] - ii[0]; ii++; 1659 for (j=0; j<n; j++) { 1660 rval = ib[j]*4; 1661 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1662 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1663 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1664 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1665 v += 16; 1666 } 1667 if (!usecprow) xb += 4; 1668 } 1669 break; 1670 case 5: 1671 for (i=0; i<mbs; i++) { 1672 if (usecprow) xb = x + 5*ridx[i]; 1673 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1674 x4 = xb[3]; x5 = xb[4]; 1675 ib = idx + ii[0]; 1676 n = ii[1] - ii[0]; ii++; 1677 for (j=0; j<n; j++) { 1678 rval = ib[j]*5; 1679 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1680 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1681 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1682 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1683 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1684 v += 25; 1685 } 1686 if (!usecprow) xb += 5; 1687 } 1688 break; 1689 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1690 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1691 #if 0 1692 { 1693 PetscInt ncols,k,bs2=a->bs2; 1694 PetscScalar *work,*workt,zb; 1695 const PetscScalar *xtmp; 1696 if (!a->mult_work) { 1697 k = PetscMax(A->rmap->n,A->cmap->n); 1698 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1699 } 1700 work = a->mult_work; 1701 xtmp = x; 1702 for (i=0; i<mbs; i++) { 1703 n = ii[1] - ii[0]; ii++; 1704 ncols = n*bs; 1705 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1706 if (usecprow) xtmp = x + bs*ridx[i]; 1707 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1708 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1709 v += n*bs2; 1710 if (!usecprow) xtmp += bs; 1711 workt = work; 1712 for (j=0; j<n; j++) { 1713 zb = z + bs*(*idx++); 1714 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1715 workt += bs; 1716 } 1717 } 1718 } 1719 #endif 1720 } 1721 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1722 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1723 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1724 PetscFunctionReturn(0); 1725 } 1726 1727 #undef __FUNCT__ 1728 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1729 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1730 { 1731 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1732 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1733 const PetscScalar *x,*xb = 0; 1734 const MatScalar *v; 1735 PetscErrorCode ierr; 1736 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1737 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1738 Mat_CompressedRow cprow = a->compressedrow; 1739 PetscBool usecprow=cprow.use; 1740 1741 PetscFunctionBegin; 1742 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1743 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1744 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1745 1746 idx = a->j; 1747 v = a->a; 1748 if (usecprow) { 1749 mbs = cprow.nrows; 1750 ii = cprow.i; 1751 ridx = cprow.rindex; 1752 } else { 1753 mbs=a->mbs; 1754 ii = a->i; 1755 xb = x; 1756 } 1757 1758 switch (bs) { 1759 case 1: 1760 for (i=0; i<mbs; i++) { 1761 if (usecprow) xb = x + ridx[i]; 1762 x1 = xb[0]; 1763 ib = idx + ii[0]; 1764 n = ii[1] - ii[0]; ii++; 1765 for (j=0; j<n; j++) { 1766 rval = ib[j]; 1767 z[rval] += *v * x1; 1768 v++; 1769 } 1770 if (!usecprow) xb++; 1771 } 1772 break; 1773 case 2: 1774 for (i=0; i<mbs; i++) { 1775 if (usecprow) xb = x + 2*ridx[i]; 1776 x1 = xb[0]; x2 = xb[1]; 1777 ib = idx + ii[0]; 1778 n = ii[1] - ii[0]; ii++; 1779 for (j=0; j<n; j++) { 1780 rval = ib[j]*2; 1781 z[rval++] += v[0]*x1 + v[1]*x2; 1782 z[rval++] += v[2]*x1 + v[3]*x2; 1783 v += 4; 1784 } 1785 if (!usecprow) xb += 2; 1786 } 1787 break; 1788 case 3: 1789 for (i=0; i<mbs; i++) { 1790 if (usecprow) xb = x + 3*ridx[i]; 1791 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1792 ib = idx + ii[0]; 1793 n = ii[1] - ii[0]; ii++; 1794 for (j=0; j<n; j++) { 1795 rval = ib[j]*3; 1796 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1797 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1798 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1799 v += 9; 1800 } 1801 if (!usecprow) xb += 3; 1802 } 1803 break; 1804 case 4: 1805 for (i=0; i<mbs; i++) { 1806 if (usecprow) xb = x + 4*ridx[i]; 1807 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1808 ib = idx + ii[0]; 1809 n = ii[1] - ii[0]; ii++; 1810 for (j=0; j<n; j++) { 1811 rval = ib[j]*4; 1812 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1813 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1814 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1815 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1816 v += 16; 1817 } 1818 if (!usecprow) xb += 4; 1819 } 1820 break; 1821 case 5: 1822 for (i=0; i<mbs; i++) { 1823 if (usecprow) xb = x + 5*ridx[i]; 1824 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1825 x4 = xb[3]; x5 = xb[4]; 1826 ib = idx + ii[0]; 1827 n = ii[1] - ii[0]; ii++; 1828 for (j=0; j<n; j++) { 1829 rval = ib[j]*5; 1830 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1831 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1832 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1833 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1834 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1835 v += 25; 1836 } 1837 if (!usecprow) xb += 5; 1838 } 1839 break; 1840 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1841 PetscInt ncols,k; 1842 PetscScalar *work,*workt; 1843 const PetscScalar *xtmp; 1844 if (!a->mult_work) { 1845 k = PetscMax(A->rmap->n,A->cmap->n); 1846 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1847 } 1848 work = a->mult_work; 1849 xtmp = x; 1850 for (i=0; i<mbs; i++) { 1851 n = ii[1] - ii[0]; ii++; 1852 ncols = n*bs; 1853 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1854 if (usecprow) xtmp = x + bs*ridx[i]; 1855 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1856 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1857 v += n*bs2; 1858 if (!usecprow) xtmp += bs; 1859 workt = work; 1860 for (j=0; j<n; j++) { 1861 zb = z + bs*(*idx++); 1862 for (k=0; k<bs; k++) zb[k] += workt[k]; 1863 workt += bs; 1864 } 1865 } 1866 } 1867 } 1868 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1869 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1870 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1871 PetscFunctionReturn(0); 1872 } 1873 1874 #undef __FUNCT__ 1875 #define __FUNCT__ "MatScale_SeqBAIJ" 1876 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1877 { 1878 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1879 PetscInt totalnz = a->bs2*a->nz; 1880 PetscScalar oalpha = alpha; 1881 PetscErrorCode ierr; 1882 PetscBLASInt one = 1,tnz; 1883 1884 PetscFunctionBegin; 1885 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1886 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1887 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1888 PetscFunctionReturn(0); 1889 } 1890 1891 #undef __FUNCT__ 1892 #define __FUNCT__ "MatNorm_SeqBAIJ" 1893 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1894 { 1895 PetscErrorCode ierr; 1896 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1897 MatScalar *v = a->a; 1898 PetscReal sum = 0.0; 1899 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1900 1901 PetscFunctionBegin; 1902 if (type == NORM_FROBENIUS) { 1903 for (i=0; i< bs2*nz; i++) { 1904 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1905 } 1906 *norm = PetscSqrtReal(sum); 1907 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 1908 } else if (type == NORM_1) { /* maximum column sum */ 1909 PetscReal *tmp; 1910 PetscInt *bcol = a->j; 1911 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 1912 for (i=0; i<nz; i++) { 1913 for (j=0; j<bs; j++) { 1914 k1 = bs*(*bcol) + j; /* column index */ 1915 for (k=0; k<bs; k++) { 1916 tmp[k1] += PetscAbsScalar(*v); v++; 1917 } 1918 } 1919 bcol++; 1920 } 1921 *norm = 0.0; 1922 for (j=0; j<A->cmap->n; j++) { 1923 if (tmp[j] > *norm) *norm = tmp[j]; 1924 } 1925 ierr = PetscFree(tmp);CHKERRQ(ierr); 1926 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 1927 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1928 *norm = 0.0; 1929 for (k=0; k<bs; k++) { 1930 for (j=0; j<a->mbs; j++) { 1931 v = a->a + bs2*a->i[j] + k; 1932 sum = 0.0; 1933 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1934 for (k1=0; k1<bs; k1++) { 1935 sum += PetscAbsScalar(*v); 1936 v += bs; 1937 } 1938 } 1939 if (sum > *norm) *norm = sum; 1940 } 1941 } 1942 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 1943 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1944 PetscFunctionReturn(0); 1945 } 1946 1947 1948 #undef __FUNCT__ 1949 #define __FUNCT__ "MatEqual_SeqBAIJ" 1950 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1951 { 1952 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1953 PetscErrorCode ierr; 1954 1955 PetscFunctionBegin; 1956 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1957 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1958 *flg = PETSC_FALSE; 1959 PetscFunctionReturn(0); 1960 } 1961 1962 /* if the a->i are the same */ 1963 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1964 if (!*flg) PetscFunctionReturn(0); 1965 1966 /* if a->j are the same */ 1967 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1968 if (!*flg) PetscFunctionReturn(0); 1969 1970 /* if a->a are the same */ 1971 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1972 PetscFunctionReturn(0); 1973 1974 } 1975 1976 #undef __FUNCT__ 1977 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1978 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1979 { 1980 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1981 PetscErrorCode ierr; 1982 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1983 PetscScalar *x,zero = 0.0; 1984 MatScalar *aa,*aa_j; 1985 1986 PetscFunctionBegin; 1987 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1988 bs = A->rmap->bs; 1989 aa = a->a; 1990 ai = a->i; 1991 aj = a->j; 1992 ambs = a->mbs; 1993 bs2 = a->bs2; 1994 1995 ierr = VecSet(v,zero);CHKERRQ(ierr); 1996 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1997 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1998 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1999 for (i=0; i<ambs; i++) { 2000 for (j=ai[i]; j<ai[i+1]; j++) { 2001 if (aj[j] == i) { 2002 row = i*bs; 2003 aa_j = aa+j*bs2; 2004 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2005 break; 2006 } 2007 } 2008 } 2009 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2010 PetscFunctionReturn(0); 2011 } 2012 2013 #undef __FUNCT__ 2014 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2015 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2016 { 2017 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2018 const PetscScalar *l,*r,*li,*ri; 2019 PetscScalar x; 2020 MatScalar *aa, *v; 2021 PetscErrorCode ierr; 2022 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2023 const PetscInt *ai,*aj; 2024 2025 PetscFunctionBegin; 2026 ai = a->i; 2027 aj = a->j; 2028 aa = a->a; 2029 m = A->rmap->n; 2030 n = A->cmap->n; 2031 bs = A->rmap->bs; 2032 mbs = a->mbs; 2033 bs2 = a->bs2; 2034 if (ll) { 2035 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2036 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2037 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2038 for (i=0; i<mbs; i++) { /* for each block row */ 2039 M = ai[i+1] - ai[i]; 2040 li = l + i*bs; 2041 v = aa + bs2*ai[i]; 2042 for (j=0; j<M; j++) { /* for each block */ 2043 for (k=0; k<bs2; k++) { 2044 (*v++) *= li[k%bs]; 2045 } 2046 } 2047 } 2048 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2049 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2050 } 2051 2052 if (rr) { 2053 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2054 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2055 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2056 for (i=0; i<mbs; i++) { /* for each block row */ 2057 iai = ai[i]; 2058 M = ai[i+1] - iai; 2059 v = aa + bs2*iai; 2060 for (j=0; j<M; j++) { /* for each block */ 2061 ri = r + bs*aj[iai+j]; 2062 for (k=0; k<bs; k++) { 2063 x = ri[k]; 2064 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2065 v += bs; 2066 } 2067 } 2068 } 2069 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2070 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2071 } 2072 PetscFunctionReturn(0); 2073 } 2074 2075 2076 #undef __FUNCT__ 2077 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2078 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2079 { 2080 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2081 2082 PetscFunctionBegin; 2083 info->block_size = a->bs2; 2084 info->nz_allocated = a->bs2*a->maxnz; 2085 info->nz_used = a->bs2*a->nz; 2086 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2087 info->assemblies = A->num_ass; 2088 info->mallocs = A->info.mallocs; 2089 info->memory = ((PetscObject)A)->mem; 2090 if (A->factortype) { 2091 info->fill_ratio_given = A->info.fill_ratio_given; 2092 info->fill_ratio_needed = A->info.fill_ratio_needed; 2093 info->factor_mallocs = A->info.factor_mallocs; 2094 } else { 2095 info->fill_ratio_given = 0; 2096 info->fill_ratio_needed = 0; 2097 info->factor_mallocs = 0; 2098 } 2099 PetscFunctionReturn(0); 2100 } 2101 2102 #undef __FUNCT__ 2103 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2104 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2105 { 2106 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2107 PetscErrorCode ierr; 2108 2109 PetscFunctionBegin; 2110 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2111 PetscFunctionReturn(0); 2112 } 2113