1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n; ++j) { 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 64 } 65 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 66 } 67 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 68 ierr = PetscFree(nidx);CHKERRQ(ierr); 69 ierr = PetscFree(nidx2);CHKERRQ(ierr); 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag; 87 88 PetscFunctionBegin; 89 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 145 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 146 for (i=0; i<nrows; i++) { 147 PetscInt ilen; 148 mat_i = c->i[i]; 149 mat_j = c->j + mat_i; 150 mat_a = c->a + mat_i*bs2; 151 ilen = c->ilen[i]; 152 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 153 } 154 ierr = PetscFree(work);CHKERRQ(ierr); 155 } 156 157 /* Free work space */ 158 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 159 ierr = PetscFree(smap);CHKERRQ(ierr); 160 ierr = PetscFree(lens);CHKERRQ(ierr); 161 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 163 164 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 165 *B = C; 166 PetscFunctionReturn(0); 167 } 168 169 #undef __FUNCT__ 170 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 171 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 172 { 173 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 174 IS is1,is2; 175 PetscErrorCode ierr; 176 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 177 const PetscInt *irow,*icol; 178 179 PetscFunctionBegin; 180 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 181 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 182 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 183 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 184 185 /* Verify if the indices corespond to each element in a block 186 and form the IS with compressed IS */ 187 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 188 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 189 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 190 for (i=0; i<a->mbs; i++) { 191 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 192 } 193 count = 0; 194 for (i=0; i<nrows; i++) { 195 PetscInt j = irow[i] / bs; 196 if ((vary[j]--)==bs) iary[count++] = j; 197 } 198 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 199 200 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 201 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 204 } 205 count = 0; 206 for (i=0; i<ncols; i++) { 207 PetscInt j = icol[i] / bs; 208 if ((vary[j]--)==bs) iary[count++] = j; 209 } 210 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 212 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 213 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 214 215 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 216 ierr = ISDestroy(&is1);CHKERRQ(ierr); 217 ierr = ISDestroy(&is2);CHKERRQ(ierr); 218 PetscFunctionReturn(0); 219 } 220 221 #undef __FUNCT__ 222 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 223 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 224 { 225 PetscErrorCode ierr; 226 PetscInt i; 227 228 PetscFunctionBegin; 229 if (scall == MAT_INITIAL_MATRIX) { 230 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 231 } 232 233 for (i=0; i<n; i++) { 234 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 235 } 236 PetscFunctionReturn(0); 237 } 238 239 240 /* -------------------------------------------------------*/ 241 /* Should check that shapes of vectors and matrices match */ 242 /* -------------------------------------------------------*/ 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatMult_SeqBAIJ_1" 246 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 247 { 248 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 249 PetscScalar *z,sum; 250 const PetscScalar *x; 251 const MatScalar *v; 252 PetscErrorCode ierr; 253 PetscInt mbs,i,n; 254 const PetscInt *idx,*ii,*ridx=NULL; 255 PetscBool usecprow=a->compressedrow.use; 256 257 PetscFunctionBegin; 258 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 259 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 260 261 if (usecprow) { 262 mbs = a->compressedrow.nrows; 263 ii = a->compressedrow.i; 264 ridx = a->compressedrow.rindex; 265 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 266 } else { 267 mbs = a->mbs; 268 ii = a->i; 269 } 270 271 for (i=0; i<mbs; i++) { 272 n = ii[1] - ii[0]; 273 v = a->a + ii[0]; 274 idx = a->j + ii[0]; 275 ii++; 276 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 277 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 278 sum = 0.0; 279 PetscSparseDensePlusDot(sum,x,v,idx,n); 280 if (usecprow) { 281 z[ridx[i]] = sum; 282 } else { 283 z[i] = sum; 284 } 285 } 286 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 287 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 288 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 289 PetscFunctionReturn(0); 290 } 291 292 #undef __FUNCT__ 293 #define __FUNCT__ "MatMult_SeqBAIJ_2" 294 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 295 { 296 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 297 PetscScalar *z = 0,sum1,sum2,*zarray; 298 const PetscScalar *x,*xb; 299 PetscScalar x1,x2; 300 const MatScalar *v; 301 PetscErrorCode ierr; 302 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 303 PetscBool usecprow=a->compressedrow.use; 304 305 PetscFunctionBegin; 306 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 307 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 308 309 idx = a->j; 310 v = a->a; 311 if (usecprow) { 312 mbs = a->compressedrow.nrows; 313 ii = a->compressedrow.i; 314 ridx = a->compressedrow.rindex; 315 } else { 316 mbs = a->mbs; 317 ii = a->i; 318 z = zarray; 319 } 320 321 for (i=0; i<mbs; i++) { 322 n = ii[1] - ii[0]; ii++; 323 sum1 = 0.0; sum2 = 0.0; 324 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 325 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 326 for (j=0; j<n; j++) { 327 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 328 sum1 += v[0]*x1 + v[2]*x2; 329 sum2 += v[1]*x1 + v[3]*x2; 330 v += 4; 331 } 332 if (usecprow) z = zarray + 2*ridx[i]; 333 z[0] = sum1; z[1] = sum2; 334 if (!usecprow) z += 2; 335 } 336 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 337 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 338 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 339 PetscFunctionReturn(0); 340 } 341 342 #undef __FUNCT__ 343 #define __FUNCT__ "MatMult_SeqBAIJ_3" 344 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 345 { 346 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 347 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 348 const PetscScalar *x,*xb; 349 const MatScalar *v; 350 PetscErrorCode ierr; 351 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 352 PetscBool usecprow=a->compressedrow.use; 353 354 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 355 #pragma disjoint(*v,*z,*xb) 356 #endif 357 358 PetscFunctionBegin; 359 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 360 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 361 362 idx = a->j; 363 v = a->a; 364 if (usecprow) { 365 mbs = a->compressedrow.nrows; 366 ii = a->compressedrow.i; 367 ridx = a->compressedrow.rindex; 368 } else { 369 mbs = a->mbs; 370 ii = a->i; 371 z = zarray; 372 } 373 374 for (i=0; i<mbs; i++) { 375 n = ii[1] - ii[0]; ii++; 376 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 377 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 378 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 379 for (j=0; j<n; j++) { 380 xb = x + 3*(*idx++); 381 x1 = xb[0]; 382 x2 = xb[1]; 383 x3 = xb[2]; 384 385 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 386 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 387 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 388 v += 9; 389 } 390 if (usecprow) z = zarray + 3*ridx[i]; 391 z[0] = sum1; z[1] = sum2; z[2] = sum3; 392 if (!usecprow) z += 3; 393 } 394 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 395 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 396 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 397 PetscFunctionReturn(0); 398 } 399 400 #undef __FUNCT__ 401 #define __FUNCT__ "MatMult_SeqBAIJ_4" 402 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 403 { 404 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 405 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 406 const PetscScalar *x,*xb; 407 const MatScalar *v; 408 PetscErrorCode ierr; 409 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 410 PetscBool usecprow=a->compressedrow.use; 411 412 PetscFunctionBegin; 413 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 414 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 415 416 idx = a->j; 417 v = a->a; 418 if (usecprow) { 419 mbs = a->compressedrow.nrows; 420 ii = a->compressedrow.i; 421 ridx = a->compressedrow.rindex; 422 } else { 423 mbs = a->mbs; 424 ii = a->i; 425 z = zarray; 426 } 427 428 for (i=0; i<mbs; i++) { 429 n = ii[1] - ii[0]; 430 ii++; 431 sum1 = 0.0; 432 sum2 = 0.0; 433 sum3 = 0.0; 434 sum4 = 0.0; 435 436 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 437 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 438 for (j=0; j<n; j++) { 439 xb = x + 4*(*idx++); 440 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 441 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 442 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 443 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 444 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 445 v += 16; 446 } 447 if (usecprow) z = zarray + 4*ridx[i]; 448 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 449 if (!usecprow) z += 4; 450 } 451 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 452 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 453 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 454 PetscFunctionReturn(0); 455 } 456 457 #undef __FUNCT__ 458 #define __FUNCT__ "MatMult_SeqBAIJ_5" 459 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 460 { 461 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 462 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 463 const PetscScalar *xb,*x; 464 const MatScalar *v; 465 PetscErrorCode ierr; 466 const PetscInt *idx,*ii,*ridx=NULL; 467 PetscInt mbs,i,j,n; 468 PetscBool usecprow=a->compressedrow.use; 469 470 PetscFunctionBegin; 471 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 472 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 473 474 idx = a->j; 475 v = a->a; 476 if (usecprow) { 477 mbs = a->compressedrow.nrows; 478 ii = a->compressedrow.i; 479 ridx = a->compressedrow.rindex; 480 } else { 481 mbs = a->mbs; 482 ii = a->i; 483 z = zarray; 484 } 485 486 for (i=0; i<mbs; i++) { 487 n = ii[1] - ii[0]; ii++; 488 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 489 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 490 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 491 for (j=0; j<n; j++) { 492 xb = x + 5*(*idx++); 493 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 494 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 495 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 496 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 497 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 498 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 499 v += 25; 500 } 501 if (usecprow) z = zarray + 5*ridx[i]; 502 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 503 if (!usecprow) z += 5; 504 } 505 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 506 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 507 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 508 PetscFunctionReturn(0); 509 } 510 511 512 513 #undef __FUNCT__ 514 #define __FUNCT__ "MatMult_SeqBAIJ_6" 515 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 516 { 517 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 518 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 519 const PetscScalar *x,*xb; 520 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 521 const MatScalar *v; 522 PetscErrorCode ierr; 523 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 524 PetscBool usecprow=a->compressedrow.use; 525 526 PetscFunctionBegin; 527 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 528 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 529 530 idx = a->j; 531 v = a->a; 532 if (usecprow) { 533 mbs = a->compressedrow.nrows; 534 ii = a->compressedrow.i; 535 ridx = a->compressedrow.rindex; 536 } else { 537 mbs = a->mbs; 538 ii = a->i; 539 z = zarray; 540 } 541 542 for (i=0; i<mbs; i++) { 543 n = ii[1] - ii[0]; 544 ii++; 545 sum1 = 0.0; 546 sum2 = 0.0; 547 sum3 = 0.0; 548 sum4 = 0.0; 549 sum5 = 0.0; 550 sum6 = 0.0; 551 552 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 553 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 554 for (j=0; j<n; j++) { 555 xb = x + 6*(*idx++); 556 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 557 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 558 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 559 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 560 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 561 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 562 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 563 v += 36; 564 } 565 if (usecprow) z = zarray + 6*ridx[i]; 566 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 567 if (!usecprow) z += 6; 568 } 569 570 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 571 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 572 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 576 #undef __FUNCT__ 577 #define __FUNCT__ "MatMult_SeqBAIJ_7" 578 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 579 { 580 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 581 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 582 const PetscScalar *x,*xb; 583 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 584 const MatScalar *v; 585 PetscErrorCode ierr; 586 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 587 PetscBool usecprow=a->compressedrow.use; 588 589 PetscFunctionBegin; 590 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 591 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 592 593 idx = a->j; 594 v = a->a; 595 if (usecprow) { 596 mbs = a->compressedrow.nrows; 597 ii = a->compressedrow.i; 598 ridx = a->compressedrow.rindex; 599 } else { 600 mbs = a->mbs; 601 ii = a->i; 602 z = zarray; 603 } 604 605 for (i=0; i<mbs; i++) { 606 n = ii[1] - ii[0]; 607 ii++; 608 sum1 = 0.0; 609 sum2 = 0.0; 610 sum3 = 0.0; 611 sum4 = 0.0; 612 sum5 = 0.0; 613 sum6 = 0.0; 614 sum7 = 0.0; 615 616 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 617 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 618 for (j=0; j<n; j++) { 619 xb = x + 7*(*idx++); 620 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 621 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 622 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 623 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 624 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 625 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 626 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 627 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 628 v += 49; 629 } 630 if (usecprow) z = zarray + 7*ridx[i]; 631 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 632 if (!usecprow) z += 7; 633 } 634 635 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 636 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 637 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 638 PetscFunctionReturn(0); 639 } 640 641 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 642 /* Default MatMult for block size 15 */ 643 644 #undef __FUNCT__ 645 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 646 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 647 { 648 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 649 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 650 const PetscScalar *x,*xb; 651 PetscScalar *zarray,xv; 652 const MatScalar *v; 653 PetscErrorCode ierr; 654 const PetscInt *ii,*ij=a->j,*idx; 655 PetscInt mbs,i,j,k,n,*ridx=NULL; 656 PetscBool usecprow=a->compressedrow.use; 657 658 PetscFunctionBegin; 659 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 660 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 661 662 v = a->a; 663 if (usecprow) { 664 mbs = a->compressedrow.nrows; 665 ii = a->compressedrow.i; 666 ridx = a->compressedrow.rindex; 667 } else { 668 mbs = a->mbs; 669 ii = a->i; 670 z = zarray; 671 } 672 673 for (i=0; i<mbs; i++) { 674 n = ii[i+1] - ii[i]; 675 idx = ij + ii[i]; 676 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 677 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 678 679 for (j=0; j<n; j++) { 680 xb = x + 15*(idx[j]); 681 682 for (k=0; k<15; k++) { 683 xv = xb[k]; 684 sum1 += v[0]*xv; 685 sum2 += v[1]*xv; 686 sum3 += v[2]*xv; 687 sum4 += v[3]*xv; 688 sum5 += v[4]*xv; 689 sum6 += v[5]*xv; 690 sum7 += v[6]*xv; 691 sum8 += v[7]*xv; 692 sum9 += v[8]*xv; 693 sum10 += v[9]*xv; 694 sum11 += v[10]*xv; 695 sum12 += v[11]*xv; 696 sum13 += v[12]*xv; 697 sum14 += v[13]*xv; 698 sum15 += v[14]*xv; 699 v += 15; 700 } 701 } 702 if (usecprow) z = zarray + 15*ridx[i]; 703 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 704 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 705 706 if (!usecprow) z += 15; 707 } 708 709 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 710 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 711 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 716 #undef __FUNCT__ 717 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 718 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 719 { 720 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 721 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 722 const PetscScalar *x,*xb; 723 PetscScalar x1,x2,x3,x4,*zarray; 724 const MatScalar *v; 725 PetscErrorCode ierr; 726 const PetscInt *ii,*ij=a->j,*idx; 727 PetscInt mbs,i,j,n,*ridx=NULL; 728 PetscBool usecprow=a->compressedrow.use; 729 730 PetscFunctionBegin; 731 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 732 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 733 734 v = a->a; 735 if (usecprow) { 736 mbs = a->compressedrow.nrows; 737 ii = a->compressedrow.i; 738 ridx = a->compressedrow.rindex; 739 } else { 740 mbs = a->mbs; 741 ii = a->i; 742 z = zarray; 743 } 744 745 for (i=0; i<mbs; i++) { 746 n = ii[i+1] - ii[i]; 747 idx = ij + ii[i]; 748 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 749 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 750 751 for (j=0; j<n; j++) { 752 xb = x + 15*(idx[j]); 753 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 754 755 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 756 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 757 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 758 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 759 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 760 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 761 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 762 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 763 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 764 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 765 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 766 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 767 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 768 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 769 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 770 771 v += 60; 772 773 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 774 775 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 776 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 777 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 778 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 779 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 780 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 781 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 782 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 783 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 784 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 785 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 786 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 787 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 788 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 789 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 790 v += 60; 791 792 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 793 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 794 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 795 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 796 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 797 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 798 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 799 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 800 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 801 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 802 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 803 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 804 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 805 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 806 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 807 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 808 v += 60; 809 810 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 811 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 812 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 813 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 814 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 815 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 816 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 817 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 818 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 819 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 820 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 821 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 822 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 823 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 824 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 825 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 826 v += 45; 827 } 828 if (usecprow) z = zarray + 15*ridx[i]; 829 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 830 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 831 832 if (!usecprow) z += 15; 833 } 834 835 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 836 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 837 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 838 PetscFunctionReturn(0); 839 } 840 841 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 842 #undef __FUNCT__ 843 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 844 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 845 { 846 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 847 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 848 const PetscScalar *x,*xb; 849 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 850 const MatScalar *v; 851 PetscErrorCode ierr; 852 const PetscInt *ii,*ij=a->j,*idx; 853 PetscInt mbs,i,j,n,*ridx=NULL; 854 PetscBool usecprow=a->compressedrow.use; 855 856 PetscFunctionBegin; 857 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 858 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 859 860 v = a->a; 861 if (usecprow) { 862 mbs = a->compressedrow.nrows; 863 ii = a->compressedrow.i; 864 ridx = a->compressedrow.rindex; 865 } else { 866 mbs = a->mbs; 867 ii = a->i; 868 z = zarray; 869 } 870 871 for (i=0; i<mbs; i++) { 872 n = ii[i+1] - ii[i]; 873 idx = ij + ii[i]; 874 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 875 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 876 877 for (j=0; j<n; j++) { 878 xb = x + 15*(idx[j]); 879 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 880 x8 = xb[7]; 881 882 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 883 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 884 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 885 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 886 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 887 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 888 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 889 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 890 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 891 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 892 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 893 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 894 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 895 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 896 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 897 v += 120; 898 899 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 900 901 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 902 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 903 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 904 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 905 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 906 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 907 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 908 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 909 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 910 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 911 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 912 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 913 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 914 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 915 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 916 v += 105; 917 } 918 if (usecprow) z = zarray + 15*ridx[i]; 919 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 920 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 921 922 if (!usecprow) z += 15; 923 } 924 925 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 926 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 927 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 928 PetscFunctionReturn(0); 929 } 930 931 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 932 933 #undef __FUNCT__ 934 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 935 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 936 { 937 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 938 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 939 const PetscScalar *x,*xb; 940 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 941 const MatScalar *v; 942 PetscErrorCode ierr; 943 const PetscInt *ii,*ij=a->j,*idx; 944 PetscInt mbs,i,j,n,*ridx=NULL; 945 PetscBool usecprow=a->compressedrow.use; 946 947 PetscFunctionBegin; 948 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 949 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 950 951 v = a->a; 952 if (usecprow) { 953 mbs = a->compressedrow.nrows; 954 ii = a->compressedrow.i; 955 ridx = a->compressedrow.rindex; 956 } else { 957 mbs = a->mbs; 958 ii = a->i; 959 z = zarray; 960 } 961 962 for (i=0; i<mbs; i++) { 963 n = ii[i+1] - ii[i]; 964 idx = ij + ii[i]; 965 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 966 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 967 968 for (j=0; j<n; j++) { 969 xb = x + 15*(idx[j]); 970 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 971 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 972 973 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 974 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 975 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 976 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 977 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 978 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 979 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 980 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 981 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 982 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 983 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 984 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 985 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 986 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 987 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 988 v += 225; 989 } 990 if (usecprow) z = zarray + 15*ridx[i]; 991 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 992 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 993 994 if (!usecprow) z += 15; 995 } 996 997 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 998 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 999 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1000 PetscFunctionReturn(0); 1001 } 1002 1003 1004 /* 1005 This will not work with MatScalar == float because it calls the BLAS 1006 */ 1007 #undef __FUNCT__ 1008 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1009 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1010 { 1011 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1012 PetscScalar *z = 0,*work,*workt,*zarray; 1013 const PetscScalar *x,*xb; 1014 const MatScalar *v; 1015 PetscErrorCode ierr; 1016 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1017 const PetscInt *idx,*ii,*ridx=NULL; 1018 PetscInt ncols,k; 1019 PetscBool usecprow=a->compressedrow.use; 1020 1021 PetscFunctionBegin; 1022 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1023 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1024 1025 idx = a->j; 1026 v = a->a; 1027 if (usecprow) { 1028 mbs = a->compressedrow.nrows; 1029 ii = a->compressedrow.i; 1030 ridx = a->compressedrow.rindex; 1031 } else { 1032 mbs = a->mbs; 1033 ii = a->i; 1034 z = zarray; 1035 } 1036 1037 if (!a->mult_work) { 1038 k = PetscMax(A->rmap->n,A->cmap->n); 1039 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1040 } 1041 work = a->mult_work; 1042 for (i=0; i<mbs; i++) { 1043 n = ii[1] - ii[0]; ii++; 1044 ncols = n*bs; 1045 workt = work; 1046 for (j=0; j<n; j++) { 1047 xb = x + bs*(*idx++); 1048 for (k=0; k<bs; k++) workt[k] = xb[k]; 1049 workt += bs; 1050 } 1051 if (usecprow) z = zarray + bs*ridx[i]; 1052 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1053 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1054 v += n*bs2; 1055 if (!usecprow) z += bs; 1056 } 1057 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1058 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1059 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1060 PetscFunctionReturn(0); 1061 } 1062 1063 #undef __FUNCT__ 1064 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1065 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1066 { 1067 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1068 const PetscScalar *x; 1069 PetscScalar *y,*z,sum; 1070 const MatScalar *v; 1071 PetscErrorCode ierr; 1072 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1073 const PetscInt *idx,*ii; 1074 PetscBool usecprow=a->compressedrow.use; 1075 1076 PetscFunctionBegin; 1077 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1078 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1079 1080 idx = a->j; 1081 v = a->a; 1082 if (usecprow) { 1083 if (zz != yy) { 1084 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1085 } 1086 mbs = a->compressedrow.nrows; 1087 ii = a->compressedrow.i; 1088 ridx = a->compressedrow.rindex; 1089 } else { 1090 ii = a->i; 1091 } 1092 1093 for (i=0; i<mbs; i++) { 1094 n = ii[1] - ii[0]; 1095 ii++; 1096 if (!usecprow) { 1097 sum = y[i]; 1098 } else { 1099 sum = y[ridx[i]]; 1100 } 1101 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1102 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1103 PetscSparseDensePlusDot(sum,x,v,idx,n); 1104 v += n; 1105 idx += n; 1106 if (usecprow) { 1107 z[ridx[i]] = sum; 1108 } else { 1109 z[i] = sum; 1110 } 1111 } 1112 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1113 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 #undef __FUNCT__ 1119 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1120 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1121 { 1122 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1123 PetscScalar *y = 0,*z = 0,sum1,sum2; 1124 const PetscScalar *x,*xb; 1125 PetscScalar x1,x2,*yarray,*zarray; 1126 const MatScalar *v; 1127 PetscErrorCode ierr; 1128 PetscInt mbs = a->mbs,i,n,j; 1129 const PetscInt *idx,*ii,*ridx = NULL; 1130 PetscBool usecprow = a->compressedrow.use; 1131 1132 PetscFunctionBegin; 1133 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1134 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1135 1136 idx = a->j; 1137 v = a->a; 1138 if (usecprow) { 1139 if (zz != yy) { 1140 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1141 } 1142 mbs = a->compressedrow.nrows; 1143 ii = a->compressedrow.i; 1144 ridx = a->compressedrow.rindex; 1145 if (zz != yy) { 1146 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1147 } 1148 } else { 1149 ii = a->i; 1150 y = yarray; 1151 z = zarray; 1152 } 1153 1154 for (i=0; i<mbs; i++) { 1155 n = ii[1] - ii[0]; ii++; 1156 if (usecprow) { 1157 z = zarray + 2*ridx[i]; 1158 y = yarray + 2*ridx[i]; 1159 } 1160 sum1 = y[0]; sum2 = y[1]; 1161 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1162 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1163 for (j=0; j<n; j++) { 1164 xb = x + 2*(*idx++); 1165 x1 = xb[0]; 1166 x2 = xb[1]; 1167 1168 sum1 += v[0]*x1 + v[2]*x2; 1169 sum2 += v[1]*x1 + v[3]*x2; 1170 v += 4; 1171 } 1172 z[0] = sum1; z[1] = sum2; 1173 if (!usecprow) { 1174 z += 2; y += 2; 1175 } 1176 } 1177 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1178 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1179 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1180 PetscFunctionReturn(0); 1181 } 1182 1183 #undef __FUNCT__ 1184 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1185 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1186 { 1187 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1188 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1189 const PetscScalar *x,*xb; 1190 const MatScalar *v; 1191 PetscErrorCode ierr; 1192 PetscInt mbs = a->mbs,i,j,n; 1193 const PetscInt *idx,*ii,*ridx = NULL; 1194 PetscBool usecprow = a->compressedrow.use; 1195 1196 PetscFunctionBegin; 1197 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1198 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1199 1200 idx = a->j; 1201 v = a->a; 1202 if (usecprow) { 1203 if (zz != yy) { 1204 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1205 } 1206 mbs = a->compressedrow.nrows; 1207 ii = a->compressedrow.i; 1208 ridx = a->compressedrow.rindex; 1209 } else { 1210 ii = a->i; 1211 y = yarray; 1212 z = zarray; 1213 } 1214 1215 for (i=0; i<mbs; i++) { 1216 n = ii[1] - ii[0]; ii++; 1217 if (usecprow) { 1218 z = zarray + 3*ridx[i]; 1219 y = yarray + 3*ridx[i]; 1220 } 1221 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1222 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1223 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1224 for (j=0; j<n; j++) { 1225 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1226 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1227 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1228 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1229 v += 9; 1230 } 1231 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1232 if (!usecprow) { 1233 z += 3; y += 3; 1234 } 1235 } 1236 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1237 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1238 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1239 PetscFunctionReturn(0); 1240 } 1241 1242 #undef __FUNCT__ 1243 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1244 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1245 { 1246 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1247 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1248 const PetscScalar *x,*xb; 1249 const MatScalar *v; 1250 PetscErrorCode ierr; 1251 PetscInt mbs = a->mbs,i,j,n; 1252 const PetscInt *idx,*ii,*ridx=NULL; 1253 PetscBool usecprow=a->compressedrow.use; 1254 1255 PetscFunctionBegin; 1256 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1257 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1258 1259 idx = a->j; 1260 v = a->a; 1261 if (usecprow) { 1262 if (zz != yy) { 1263 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1264 } 1265 mbs = a->compressedrow.nrows; 1266 ii = a->compressedrow.i; 1267 ridx = a->compressedrow.rindex; 1268 } else { 1269 ii = a->i; 1270 y = yarray; 1271 z = zarray; 1272 } 1273 1274 for (i=0; i<mbs; i++) { 1275 n = ii[1] - ii[0]; ii++; 1276 if (usecprow) { 1277 z = zarray + 4*ridx[i]; 1278 y = yarray + 4*ridx[i]; 1279 } 1280 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1281 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1282 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1283 for (j=0; j<n; j++) { 1284 xb = x + 4*(*idx++); 1285 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1286 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1287 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1288 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1289 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1290 v += 16; 1291 } 1292 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1293 if (!usecprow) { 1294 z += 4; y += 4; 1295 } 1296 } 1297 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1298 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1299 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1300 PetscFunctionReturn(0); 1301 } 1302 1303 #undef __FUNCT__ 1304 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1305 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1306 { 1307 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1308 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1309 const PetscScalar *x,*xb; 1310 PetscScalar *yarray,*zarray; 1311 const MatScalar *v; 1312 PetscErrorCode ierr; 1313 PetscInt mbs = a->mbs,i,j,n; 1314 const PetscInt *idx,*ii,*ridx = NULL; 1315 PetscBool usecprow=a->compressedrow.use; 1316 1317 PetscFunctionBegin; 1318 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1319 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1320 1321 idx = a->j; 1322 v = a->a; 1323 if (usecprow) { 1324 if (zz != yy) { 1325 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1326 } 1327 mbs = a->compressedrow.nrows; 1328 ii = a->compressedrow.i; 1329 ridx = a->compressedrow.rindex; 1330 } else { 1331 ii = a->i; 1332 y = yarray; 1333 z = zarray; 1334 } 1335 1336 for (i=0; i<mbs; i++) { 1337 n = ii[1] - ii[0]; ii++; 1338 if (usecprow) { 1339 z = zarray + 5*ridx[i]; 1340 y = yarray + 5*ridx[i]; 1341 } 1342 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1343 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1344 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1345 for (j=0; j<n; j++) { 1346 xb = x + 5*(*idx++); 1347 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1348 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1349 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1350 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1351 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1352 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1353 v += 25; 1354 } 1355 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1356 if (!usecprow) { 1357 z += 5; y += 5; 1358 } 1359 } 1360 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1361 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1362 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1363 PetscFunctionReturn(0); 1364 } 1365 #undef __FUNCT__ 1366 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1367 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1368 { 1369 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1370 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1371 const PetscScalar *x,*xb; 1372 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1373 const MatScalar *v; 1374 PetscErrorCode ierr; 1375 PetscInt mbs = a->mbs,i,j,n; 1376 const PetscInt *idx,*ii,*ridx=NULL; 1377 PetscBool usecprow=a->compressedrow.use; 1378 1379 PetscFunctionBegin; 1380 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1381 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1382 1383 idx = a->j; 1384 v = a->a; 1385 if (usecprow) { 1386 if (zz != yy) { 1387 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1388 } 1389 mbs = a->compressedrow.nrows; 1390 ii = a->compressedrow.i; 1391 ridx = a->compressedrow.rindex; 1392 } else { 1393 ii = a->i; 1394 y = yarray; 1395 z = zarray; 1396 } 1397 1398 for (i=0; i<mbs; i++) { 1399 n = ii[1] - ii[0]; ii++; 1400 if (usecprow) { 1401 z = zarray + 6*ridx[i]; 1402 y = yarray + 6*ridx[i]; 1403 } 1404 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1405 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1406 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1407 for (j=0; j<n; j++) { 1408 xb = x + 6*(*idx++); 1409 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1410 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1411 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1412 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1413 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1414 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1415 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1416 v += 36; 1417 } 1418 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1419 if (!usecprow) { 1420 z += 6; y += 6; 1421 } 1422 } 1423 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1424 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1425 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1426 PetscFunctionReturn(0); 1427 } 1428 1429 #undef __FUNCT__ 1430 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1431 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1432 { 1433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1434 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1435 const PetscScalar *x,*xb; 1436 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1437 const MatScalar *v; 1438 PetscErrorCode ierr; 1439 PetscInt mbs = a->mbs,i,j,n; 1440 const PetscInt *idx,*ii,*ridx = NULL; 1441 PetscBool usecprow=a->compressedrow.use; 1442 1443 PetscFunctionBegin; 1444 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1445 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1446 1447 idx = a->j; 1448 v = a->a; 1449 if (usecprow) { 1450 if (zz != yy) { 1451 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1452 } 1453 mbs = a->compressedrow.nrows; 1454 ii = a->compressedrow.i; 1455 ridx = a->compressedrow.rindex; 1456 } else { 1457 ii = a->i; 1458 y = yarray; 1459 z = zarray; 1460 } 1461 1462 for (i=0; i<mbs; i++) { 1463 n = ii[1] - ii[0]; ii++; 1464 if (usecprow) { 1465 z = zarray + 7*ridx[i]; 1466 y = yarray + 7*ridx[i]; 1467 } 1468 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1469 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1470 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1471 for (j=0; j<n; j++) { 1472 xb = x + 7*(*idx++); 1473 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1474 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1475 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1476 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1477 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1478 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1479 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1480 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1481 v += 49; 1482 } 1483 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1484 if (!usecprow) { 1485 z += 7; y += 7; 1486 } 1487 } 1488 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1489 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1490 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1491 PetscFunctionReturn(0); 1492 } 1493 1494 #undef __FUNCT__ 1495 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1496 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1497 { 1498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1499 PetscScalar *z = 0,*work,*workt,*zarray; 1500 const PetscScalar *x,*xb; 1501 const MatScalar *v; 1502 PetscErrorCode ierr; 1503 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1504 PetscInt ncols,k; 1505 const PetscInt *ridx = NULL,*idx,*ii; 1506 PetscBool usecprow = a->compressedrow.use; 1507 1508 PetscFunctionBegin; 1509 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1510 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1511 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1512 1513 idx = a->j; 1514 v = a->a; 1515 if (usecprow) { 1516 mbs = a->compressedrow.nrows; 1517 ii = a->compressedrow.i; 1518 ridx = a->compressedrow.rindex; 1519 } else { 1520 mbs = a->mbs; 1521 ii = a->i; 1522 z = zarray; 1523 } 1524 1525 if (!a->mult_work) { 1526 k = PetscMax(A->rmap->n,A->cmap->n); 1527 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1528 } 1529 work = a->mult_work; 1530 for (i=0; i<mbs; i++) { 1531 n = ii[1] - ii[0]; ii++; 1532 ncols = n*bs; 1533 workt = work; 1534 for (j=0; j<n; j++) { 1535 xb = x + bs*(*idx++); 1536 for (k=0; k<bs; k++) workt[k] = xb[k]; 1537 workt += bs; 1538 } 1539 if (usecprow) z = zarray + bs*ridx[i]; 1540 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1541 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1542 v += n*bs2; 1543 if (!usecprow) z += bs; 1544 } 1545 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1546 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1547 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 #undef __FUNCT__ 1552 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1553 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1554 { 1555 PetscScalar zero = 0.0; 1556 PetscErrorCode ierr; 1557 1558 PetscFunctionBegin; 1559 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1560 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1561 PetscFunctionReturn(0); 1562 } 1563 1564 #undef __FUNCT__ 1565 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1566 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1567 { 1568 PetscScalar zero = 0.0; 1569 PetscErrorCode ierr; 1570 1571 PetscFunctionBegin; 1572 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1573 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1574 PetscFunctionReturn(0); 1575 } 1576 1577 #undef __FUNCT__ 1578 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1579 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1580 { 1581 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1582 PetscScalar *z,x1,x2,x3,x4,x5; 1583 const PetscScalar *x,*xb = NULL; 1584 const MatScalar *v; 1585 PetscErrorCode ierr; 1586 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1587 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1588 Mat_CompressedRow cprow = a->compressedrow; 1589 PetscBool usecprow = cprow.use; 1590 1591 PetscFunctionBegin; 1592 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1593 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1594 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1595 1596 idx = a->j; 1597 v = a->a; 1598 if (usecprow) { 1599 mbs = cprow.nrows; 1600 ii = cprow.i; 1601 ridx = cprow.rindex; 1602 } else { 1603 mbs=a->mbs; 1604 ii = a->i; 1605 xb = x; 1606 } 1607 1608 switch (bs) { 1609 case 1: 1610 for (i=0; i<mbs; i++) { 1611 if (usecprow) xb = x + ridx[i]; 1612 x1 = xb[0]; 1613 ib = idx + ii[0]; 1614 n = ii[1] - ii[0]; ii++; 1615 for (j=0; j<n; j++) { 1616 rval = ib[j]; 1617 z[rval] += PetscConj(*v) * x1; 1618 v++; 1619 } 1620 if (!usecprow) xb++; 1621 } 1622 break; 1623 case 2: 1624 for (i=0; i<mbs; i++) { 1625 if (usecprow) xb = x + 2*ridx[i]; 1626 x1 = xb[0]; x2 = xb[1]; 1627 ib = idx + ii[0]; 1628 n = ii[1] - ii[0]; ii++; 1629 for (j=0; j<n; j++) { 1630 rval = ib[j]*2; 1631 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1632 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1633 v += 4; 1634 } 1635 if (!usecprow) xb += 2; 1636 } 1637 break; 1638 case 3: 1639 for (i=0; i<mbs; i++) { 1640 if (usecprow) xb = x + 3*ridx[i]; 1641 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1642 ib = idx + ii[0]; 1643 n = ii[1] - ii[0]; ii++; 1644 for (j=0; j<n; j++) { 1645 rval = ib[j]*3; 1646 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1647 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1648 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1649 v += 9; 1650 } 1651 if (!usecprow) xb += 3; 1652 } 1653 break; 1654 case 4: 1655 for (i=0; i<mbs; i++) { 1656 if (usecprow) xb = x + 4*ridx[i]; 1657 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1658 ib = idx + ii[0]; 1659 n = ii[1] - ii[0]; ii++; 1660 for (j=0; j<n; j++) { 1661 rval = ib[j]*4; 1662 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1663 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1664 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1665 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1666 v += 16; 1667 } 1668 if (!usecprow) xb += 4; 1669 } 1670 break; 1671 case 5: 1672 for (i=0; i<mbs; i++) { 1673 if (usecprow) xb = x + 5*ridx[i]; 1674 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1675 x4 = xb[3]; x5 = xb[4]; 1676 ib = idx + ii[0]; 1677 n = ii[1] - ii[0]; ii++; 1678 for (j=0; j<n; j++) { 1679 rval = ib[j]*5; 1680 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1681 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1682 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1683 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1684 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1685 v += 25; 1686 } 1687 if (!usecprow) xb += 5; 1688 } 1689 break; 1690 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1691 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1692 #if 0 1693 { 1694 PetscInt ncols,k,bs2=a->bs2; 1695 PetscScalar *work,*workt,zb; 1696 const PetscScalar *xtmp; 1697 if (!a->mult_work) { 1698 k = PetscMax(A->rmap->n,A->cmap->n); 1699 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1700 } 1701 work = a->mult_work; 1702 xtmp = x; 1703 for (i=0; i<mbs; i++) { 1704 n = ii[1] - ii[0]; ii++; 1705 ncols = n*bs; 1706 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1707 if (usecprow) xtmp = x + bs*ridx[i]; 1708 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1709 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1710 v += n*bs2; 1711 if (!usecprow) xtmp += bs; 1712 workt = work; 1713 for (j=0; j<n; j++) { 1714 zb = z + bs*(*idx++); 1715 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1716 workt += bs; 1717 } 1718 } 1719 } 1720 #endif 1721 } 1722 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1723 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1724 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1725 PetscFunctionReturn(0); 1726 } 1727 1728 #undef __FUNCT__ 1729 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1730 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1731 { 1732 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1733 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1734 const PetscScalar *x,*xb = 0; 1735 const MatScalar *v; 1736 PetscErrorCode ierr; 1737 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1738 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1739 Mat_CompressedRow cprow = a->compressedrow; 1740 PetscBool usecprow=cprow.use; 1741 1742 PetscFunctionBegin; 1743 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1744 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1745 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1746 1747 idx = a->j; 1748 v = a->a; 1749 if (usecprow) { 1750 mbs = cprow.nrows; 1751 ii = cprow.i; 1752 ridx = cprow.rindex; 1753 } else { 1754 mbs=a->mbs; 1755 ii = a->i; 1756 xb = x; 1757 } 1758 1759 switch (bs) { 1760 case 1: 1761 for (i=0; i<mbs; i++) { 1762 if (usecprow) xb = x + ridx[i]; 1763 x1 = xb[0]; 1764 ib = idx + ii[0]; 1765 n = ii[1] - ii[0]; ii++; 1766 for (j=0; j<n; j++) { 1767 rval = ib[j]; 1768 z[rval] += *v * x1; 1769 v++; 1770 } 1771 if (!usecprow) xb++; 1772 } 1773 break; 1774 case 2: 1775 for (i=0; i<mbs; i++) { 1776 if (usecprow) xb = x + 2*ridx[i]; 1777 x1 = xb[0]; x2 = xb[1]; 1778 ib = idx + ii[0]; 1779 n = ii[1] - ii[0]; ii++; 1780 for (j=0; j<n; j++) { 1781 rval = ib[j]*2; 1782 z[rval++] += v[0]*x1 + v[1]*x2; 1783 z[rval++] += v[2]*x1 + v[3]*x2; 1784 v += 4; 1785 } 1786 if (!usecprow) xb += 2; 1787 } 1788 break; 1789 case 3: 1790 for (i=0; i<mbs; i++) { 1791 if (usecprow) xb = x + 3*ridx[i]; 1792 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1793 ib = idx + ii[0]; 1794 n = ii[1] - ii[0]; ii++; 1795 for (j=0; j<n; j++) { 1796 rval = ib[j]*3; 1797 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1798 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1799 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1800 v += 9; 1801 } 1802 if (!usecprow) xb += 3; 1803 } 1804 break; 1805 case 4: 1806 for (i=0; i<mbs; i++) { 1807 if (usecprow) xb = x + 4*ridx[i]; 1808 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1809 ib = idx + ii[0]; 1810 n = ii[1] - ii[0]; ii++; 1811 for (j=0; j<n; j++) { 1812 rval = ib[j]*4; 1813 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1814 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1815 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1816 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1817 v += 16; 1818 } 1819 if (!usecprow) xb += 4; 1820 } 1821 break; 1822 case 5: 1823 for (i=0; i<mbs; i++) { 1824 if (usecprow) xb = x + 5*ridx[i]; 1825 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1826 x4 = xb[3]; x5 = xb[4]; 1827 ib = idx + ii[0]; 1828 n = ii[1] - ii[0]; ii++; 1829 for (j=0; j<n; j++) { 1830 rval = ib[j]*5; 1831 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1832 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1833 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1834 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1835 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1836 v += 25; 1837 } 1838 if (!usecprow) xb += 5; 1839 } 1840 break; 1841 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1842 PetscInt ncols,k; 1843 PetscScalar *work,*workt; 1844 const PetscScalar *xtmp; 1845 if (!a->mult_work) { 1846 k = PetscMax(A->rmap->n,A->cmap->n); 1847 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1848 } 1849 work = a->mult_work; 1850 xtmp = x; 1851 for (i=0; i<mbs; i++) { 1852 n = ii[1] - ii[0]; ii++; 1853 ncols = n*bs; 1854 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1855 if (usecprow) xtmp = x + bs*ridx[i]; 1856 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1857 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1858 v += n*bs2; 1859 if (!usecprow) xtmp += bs; 1860 workt = work; 1861 for (j=0; j<n; j++) { 1862 zb = z + bs*(*idx++); 1863 for (k=0; k<bs; k++) zb[k] += workt[k]; 1864 workt += bs; 1865 } 1866 } 1867 } 1868 } 1869 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1870 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1871 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1872 PetscFunctionReturn(0); 1873 } 1874 1875 #undef __FUNCT__ 1876 #define __FUNCT__ "MatScale_SeqBAIJ" 1877 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1878 { 1879 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1880 PetscInt totalnz = a->bs2*a->nz; 1881 PetscScalar oalpha = alpha; 1882 PetscErrorCode ierr; 1883 PetscBLASInt one = 1,tnz; 1884 1885 PetscFunctionBegin; 1886 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1887 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1888 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1889 PetscFunctionReturn(0); 1890 } 1891 1892 #undef __FUNCT__ 1893 #define __FUNCT__ "MatNorm_SeqBAIJ" 1894 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1895 { 1896 PetscErrorCode ierr; 1897 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1898 MatScalar *v = a->a; 1899 PetscReal sum = 0.0; 1900 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1901 1902 PetscFunctionBegin; 1903 if (type == NORM_FROBENIUS) { 1904 for (i=0; i< bs2*nz; i++) { 1905 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1906 } 1907 *norm = PetscSqrtReal(sum); 1908 } else if (type == NORM_1) { /* maximum column sum */ 1909 PetscReal *tmp; 1910 PetscInt *bcol = a->j; 1911 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 1912 for (i=0; i<nz; i++) { 1913 for (j=0; j<bs; j++) { 1914 k1 = bs*(*bcol) + j; /* column index */ 1915 for (k=0; k<bs; k++) { 1916 tmp[k1] += PetscAbsScalar(*v); v++; 1917 } 1918 } 1919 bcol++; 1920 } 1921 *norm = 0.0; 1922 for (j=0; j<A->cmap->n; j++) { 1923 if (tmp[j] > *norm) *norm = tmp[j]; 1924 } 1925 ierr = PetscFree(tmp);CHKERRQ(ierr); 1926 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1927 *norm = 0.0; 1928 for (k=0; k<bs; k++) { 1929 for (j=0; j<a->mbs; j++) { 1930 v = a->a + bs2*a->i[j] + k; 1931 sum = 0.0; 1932 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1933 for (k1=0; k1<bs; k1++) { 1934 sum += PetscAbsScalar(*v); 1935 v += bs; 1936 } 1937 } 1938 if (sum > *norm) *norm = sum; 1939 } 1940 } 1941 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1942 PetscFunctionReturn(0); 1943 } 1944 1945 1946 #undef __FUNCT__ 1947 #define __FUNCT__ "MatEqual_SeqBAIJ" 1948 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1949 { 1950 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1951 PetscErrorCode ierr; 1952 1953 PetscFunctionBegin; 1954 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1955 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1956 *flg = PETSC_FALSE; 1957 PetscFunctionReturn(0); 1958 } 1959 1960 /* if the a->i are the same */ 1961 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1962 if (!*flg) PetscFunctionReturn(0); 1963 1964 /* if a->j are the same */ 1965 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1966 if (!*flg) PetscFunctionReturn(0); 1967 1968 /* if a->a are the same */ 1969 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1970 PetscFunctionReturn(0); 1971 1972 } 1973 1974 #undef __FUNCT__ 1975 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1976 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1977 { 1978 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1979 PetscErrorCode ierr; 1980 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1981 PetscScalar *x,zero = 0.0; 1982 MatScalar *aa,*aa_j; 1983 1984 PetscFunctionBegin; 1985 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1986 bs = A->rmap->bs; 1987 aa = a->a; 1988 ai = a->i; 1989 aj = a->j; 1990 ambs = a->mbs; 1991 bs2 = a->bs2; 1992 1993 ierr = VecSet(v,zero);CHKERRQ(ierr); 1994 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1995 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1996 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1997 for (i=0; i<ambs; i++) { 1998 for (j=ai[i]; j<ai[i+1]; j++) { 1999 if (aj[j] == i) { 2000 row = i*bs; 2001 aa_j = aa+j*bs2; 2002 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2003 break; 2004 } 2005 } 2006 } 2007 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2008 PetscFunctionReturn(0); 2009 } 2010 2011 #undef __FUNCT__ 2012 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2013 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2014 { 2015 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2016 const PetscScalar *l,*r,*li,*ri; 2017 PetscScalar x; 2018 MatScalar *aa, *v; 2019 PetscErrorCode ierr; 2020 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2021 const PetscInt *ai,*aj; 2022 2023 PetscFunctionBegin; 2024 ai = a->i; 2025 aj = a->j; 2026 aa = a->a; 2027 m = A->rmap->n; 2028 n = A->cmap->n; 2029 bs = A->rmap->bs; 2030 mbs = a->mbs; 2031 bs2 = a->bs2; 2032 if (ll) { 2033 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2034 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2035 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2036 for (i=0; i<mbs; i++) { /* for each block row */ 2037 M = ai[i+1] - ai[i]; 2038 li = l + i*bs; 2039 v = aa + bs2*ai[i]; 2040 for (j=0; j<M; j++) { /* for each block */ 2041 for (k=0; k<bs2; k++) { 2042 (*v++) *= li[k%bs]; 2043 } 2044 } 2045 } 2046 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2047 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2048 } 2049 2050 if (rr) { 2051 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2052 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2053 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2054 for (i=0; i<mbs; i++) { /* for each block row */ 2055 iai = ai[i]; 2056 M = ai[i+1] - iai; 2057 v = aa + bs2*iai; 2058 for (j=0; j<M; j++) { /* for each block */ 2059 ri = r + bs*aj[iai+j]; 2060 for (k=0; k<bs; k++) { 2061 x = ri[k]; 2062 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2063 v += bs; 2064 } 2065 } 2066 } 2067 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2068 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2069 } 2070 PetscFunctionReturn(0); 2071 } 2072 2073 2074 #undef __FUNCT__ 2075 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2076 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2077 { 2078 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2079 2080 PetscFunctionBegin; 2081 info->block_size = a->bs2; 2082 info->nz_allocated = a->bs2*a->maxnz; 2083 info->nz_used = a->bs2*a->nz; 2084 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2085 info->assemblies = A->num_ass; 2086 info->mallocs = A->info.mallocs; 2087 info->memory = ((PetscObject)A)->mem; 2088 if (A->factortype) { 2089 info->fill_ratio_given = A->info.fill_ratio_given; 2090 info->fill_ratio_needed = A->info.fill_ratio_needed; 2091 info->factor_mallocs = A->info.factor_mallocs; 2092 } else { 2093 info->fill_ratio_given = 0; 2094 info->fill_ratio_needed = 0; 2095 info->factor_mallocs = 0; 2096 } 2097 PetscFunctionReturn(0); 2098 } 2099 2100 #undef __FUNCT__ 2101 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2102 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2103 { 2104 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2105 PetscErrorCode ierr; 2106 2107 PetscFunctionBegin; 2108 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2109 PetscFunctionReturn(0); 2110 } 2111