1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 8 { 9 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 10 PetscErrorCode ierr; 11 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 12 const PetscInt *idx; 13 PetscInt start,end,*ai,*aj,bs,*nidx2; 14 PetscBT table; 15 16 PetscFunctionBegin; 17 m = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 22 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 23 24 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 25 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 28 for (i=0; i<is_max; i++) { 29 /* Initialise the two local arrays */ 30 isz = 0; 31 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 38 for (j=0; j<n; ++j) { 39 ival = idx[j]/bs; /* convert the indices into block indices */ 40 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 42 } 43 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 44 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 45 46 k = 0; 47 for (j=0; j<ov; j++) { /* for each overlap*/ 48 n = isz; 49 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 50 row = nidx[k]; 51 start = ai[row]; 52 end = ai[row+1]; 53 for (l = start; l<end; l++) { 54 val = aj[l]; 55 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 56 } 57 } 58 } 59 /* expand the Index Set */ 60 for (j=0; j<isz; j++) { 61 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 62 } 63 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 64 } 65 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 66 ierr = PetscFree(nidx);CHKERRQ(ierr); 67 ierr = PetscFree(nidx2);CHKERRQ(ierr); 68 PetscFunctionReturn(0); 69 } 70 71 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 72 { 73 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 74 PetscErrorCode ierr; 75 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 76 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 77 const PetscInt *irow,*icol; 78 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 79 PetscInt *aj = a->j,*ai = a->i; 80 MatScalar *mat_a; 81 Mat C; 82 PetscBool flag; 83 84 PetscFunctionBegin; 85 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 86 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 87 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 88 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 89 90 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 91 ssmap = smap; 92 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 93 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 94 /* determine lens of each row */ 95 for (i=0; i<nrows; i++) { 96 kstart = ai[irow[i]]; 97 kend = kstart + a->ilen[irow[i]]; 98 lens[i] = 0; 99 for (k=kstart; k<kend; k++) { 100 if (ssmap[aj[k]]) lens[i]++; 101 } 102 } 103 /* Create and fill new matrix */ 104 if (scall == MAT_REUSE_MATRIX) { 105 c = (Mat_SeqBAIJ*)((*B)->data); 106 107 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 108 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 109 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 110 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 111 C = *B; 112 } else { 113 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 114 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 115 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 116 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 117 } 118 c = (Mat_SeqBAIJ*)(C->data); 119 for (i=0; i<nrows; i++) { 120 row = irow[i]; 121 kstart = ai[row]; 122 kend = kstart + a->ilen[row]; 123 mat_i = c->i[i]; 124 mat_j = c->j + mat_i; 125 mat_a = c->a + mat_i*bs2; 126 mat_ilen = c->ilen + i; 127 for (k=kstart; k<kend; k++) { 128 if ((tcol=ssmap[a->j[k]])) { 129 *mat_j++ = tcol - 1; 130 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 131 mat_a += bs2; 132 (*mat_ilen)++; 133 } 134 } 135 } 136 /* sort */ 137 { 138 MatScalar *work; 139 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 140 for (i=0; i<nrows; i++) { 141 PetscInt ilen; 142 mat_i = c->i[i]; 143 mat_j = c->j + mat_i; 144 mat_a = c->a + mat_i*bs2; 145 ilen = c->ilen[i]; 146 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 147 } 148 ierr = PetscFree(work);CHKERRQ(ierr); 149 } 150 151 /* Free work space */ 152 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 153 ierr = PetscFree(smap);CHKERRQ(ierr); 154 ierr = PetscFree(lens);CHKERRQ(ierr); 155 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 157 158 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 maxmnbs = PetscMax(a->mbs,a->nbs); 180 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 } 186 count = 0; 187 for (i=0; i<nrows; i++) { 188 j = irow[i] / bs; 189 if ((vary[j]--)==bs) iary[count++] = j; 190 } 191 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 192 193 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 194 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 195 for (i=0; i<a->nbs; i++) { 196 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 197 } 198 count = 0; 199 for (i=0; i<ncols; i++) { 200 j = icol[i] / bs; 201 if ((vary[j]--)==bs) iary[count++] = j; 202 } 203 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 206 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 207 208 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 209 ierr = ISDestroy(&is1);CHKERRQ(ierr); 210 ierr = ISDestroy(&is2);CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 } 213 214 PetscErrorCode MatDestroy_SeqBAIJ_Submatrices(Mat C) 215 { 216 PetscErrorCode ierr; 217 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 218 Mat_SubMat *submatj = c->submatis1; 219 220 PetscFunctionBegin; 221 ierr = submatj->destroy(C);CHKERRQ(ierr); 222 ierr = MatDestroySubMatrices_Private(submatj);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 227 { 228 PetscErrorCode ierr; 229 PetscInt i; 230 231 PetscFunctionBegin; 232 if (scall == MAT_INITIAL_MATRIX) { 233 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 234 } 235 236 for (i=0; i<n; i++) { 237 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 238 } 239 PetscFunctionReturn(0); 240 } 241 242 243 /* -------------------------------------------------------*/ 244 /* Should check that shapes of vectors and matrices match */ 245 /* -------------------------------------------------------*/ 246 247 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 248 { 249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 250 PetscScalar *z,sum; 251 const PetscScalar *x; 252 const MatScalar *v; 253 PetscErrorCode ierr; 254 PetscInt mbs,i,n; 255 const PetscInt *idx,*ii,*ridx=NULL; 256 PetscBool usecprow=a->compressedrow.use; 257 258 PetscFunctionBegin; 259 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 260 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 261 262 if (usecprow) { 263 mbs = a->compressedrow.nrows; 264 ii = a->compressedrow.i; 265 ridx = a->compressedrow.rindex; 266 ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 267 } else { 268 mbs = a->mbs; 269 ii = a->i; 270 } 271 272 for (i=0; i<mbs; i++) { 273 n = ii[1] - ii[0]; 274 v = a->a + ii[0]; 275 idx = a->j + ii[0]; 276 ii++; 277 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 278 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 279 sum = 0.0; 280 PetscSparseDensePlusDot(sum,x,v,idx,n); 281 if (usecprow) { 282 z[ridx[i]] = sum; 283 } else { 284 z[i] = sum; 285 } 286 } 287 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 288 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 289 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 290 PetscFunctionReturn(0); 291 } 292 293 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 294 { 295 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 296 PetscScalar *z = 0,sum1,sum2,*zarray; 297 const PetscScalar *x,*xb; 298 PetscScalar x1,x2; 299 const MatScalar *v; 300 PetscErrorCode ierr; 301 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 302 PetscBool usecprow=a->compressedrow.use; 303 304 PetscFunctionBegin; 305 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 306 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 307 308 idx = a->j; 309 v = a->a; 310 if (usecprow) { 311 mbs = a->compressedrow.nrows; 312 ii = a->compressedrow.i; 313 ridx = a->compressedrow.rindex; 314 ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 315 } else { 316 mbs = a->mbs; 317 ii = a->i; 318 z = zarray; 319 } 320 321 for (i=0; i<mbs; i++) { 322 n = ii[1] - ii[0]; ii++; 323 sum1 = 0.0; sum2 = 0.0; 324 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 325 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 326 for (j=0; j<n; j++) { 327 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 328 sum1 += v[0]*x1 + v[2]*x2; 329 sum2 += v[1]*x1 + v[3]*x2; 330 v += 4; 331 } 332 if (usecprow) z = zarray + 2*ridx[i]; 333 z[0] = sum1; z[1] = sum2; 334 if (!usecprow) z += 2; 335 } 336 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 337 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 338 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 339 PetscFunctionReturn(0); 340 } 341 342 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 343 { 344 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 345 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 346 const PetscScalar *x,*xb; 347 const MatScalar *v; 348 PetscErrorCode ierr; 349 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 350 PetscBool usecprow=a->compressedrow.use; 351 352 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 353 #pragma disjoint(*v,*z,*xb) 354 #endif 355 356 PetscFunctionBegin; 357 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 358 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 359 360 idx = a->j; 361 v = a->a; 362 if (usecprow) { 363 mbs = a->compressedrow.nrows; 364 ii = a->compressedrow.i; 365 ridx = a->compressedrow.rindex; 366 ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 367 } else { 368 mbs = a->mbs; 369 ii = a->i; 370 z = zarray; 371 } 372 373 for (i=0; i<mbs; i++) { 374 n = ii[1] - ii[0]; ii++; 375 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 376 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 377 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 378 for (j=0; j<n; j++) { 379 xb = x + 3*(*idx++); 380 x1 = xb[0]; 381 x2 = xb[1]; 382 x3 = xb[2]; 383 384 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 385 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 386 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 387 v += 9; 388 } 389 if (usecprow) z = zarray + 3*ridx[i]; 390 z[0] = sum1; z[1] = sum2; z[2] = sum3; 391 if (!usecprow) z += 3; 392 } 393 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 394 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 395 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 396 PetscFunctionReturn(0); 397 } 398 399 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 400 { 401 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 402 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 403 const PetscScalar *x,*xb; 404 const MatScalar *v; 405 PetscErrorCode ierr; 406 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 407 PetscBool usecprow=a->compressedrow.use; 408 409 PetscFunctionBegin; 410 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 411 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 412 413 idx = a->j; 414 v = a->a; 415 if (usecprow) { 416 mbs = a->compressedrow.nrows; 417 ii = a->compressedrow.i; 418 ridx = a->compressedrow.rindex; 419 ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 420 } else { 421 mbs = a->mbs; 422 ii = a->i; 423 z = zarray; 424 } 425 426 for (i=0; i<mbs; i++) { 427 n = ii[1] - ii[0]; 428 ii++; 429 sum1 = 0.0; 430 sum2 = 0.0; 431 sum3 = 0.0; 432 sum4 = 0.0; 433 434 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 435 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 436 for (j=0; j<n; j++) { 437 xb = x + 4*(*idx++); 438 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 439 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 440 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 441 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 442 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 443 v += 16; 444 } 445 if (usecprow) z = zarray + 4*ridx[i]; 446 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 447 if (!usecprow) z += 4; 448 } 449 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 450 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 451 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 452 PetscFunctionReturn(0); 453 } 454 455 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 456 { 457 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 458 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 459 const PetscScalar *xb,*x; 460 const MatScalar *v; 461 PetscErrorCode ierr; 462 const PetscInt *idx,*ii,*ridx=NULL; 463 PetscInt mbs,i,j,n; 464 PetscBool usecprow=a->compressedrow.use; 465 466 PetscFunctionBegin; 467 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 468 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 469 470 idx = a->j; 471 v = a->a; 472 if (usecprow) { 473 mbs = a->compressedrow.nrows; 474 ii = a->compressedrow.i; 475 ridx = a->compressedrow.rindex; 476 ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 477 } else { 478 mbs = a->mbs; 479 ii = a->i; 480 z = zarray; 481 } 482 483 for (i=0; i<mbs; i++) { 484 n = ii[1] - ii[0]; ii++; 485 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 486 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 487 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 488 for (j=0; j<n; j++) { 489 xb = x + 5*(*idx++); 490 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 491 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 492 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 493 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 494 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 495 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 496 v += 25; 497 } 498 if (usecprow) z = zarray + 5*ridx[i]; 499 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 500 if (!usecprow) z += 5; 501 } 502 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 503 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 504 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 505 PetscFunctionReturn(0); 506 } 507 508 509 510 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 511 { 512 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 513 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 514 const PetscScalar *x,*xb; 515 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 516 const MatScalar *v; 517 PetscErrorCode ierr; 518 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 519 PetscBool usecprow=a->compressedrow.use; 520 521 PetscFunctionBegin; 522 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 523 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 524 525 idx = a->j; 526 v = a->a; 527 if (usecprow) { 528 mbs = a->compressedrow.nrows; 529 ii = a->compressedrow.i; 530 ridx = a->compressedrow.rindex; 531 ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 532 } else { 533 mbs = a->mbs; 534 ii = a->i; 535 z = zarray; 536 } 537 538 for (i=0; i<mbs; i++) { 539 n = ii[1] - ii[0]; 540 ii++; 541 sum1 = 0.0; 542 sum2 = 0.0; 543 sum3 = 0.0; 544 sum4 = 0.0; 545 sum5 = 0.0; 546 sum6 = 0.0; 547 548 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 549 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 550 for (j=0; j<n; j++) { 551 xb = x + 6*(*idx++); 552 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 553 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 554 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 555 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 556 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 557 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 558 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 559 v += 36; 560 } 561 if (usecprow) z = zarray + 6*ridx[i]; 562 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 563 if (!usecprow) z += 6; 564 } 565 566 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 567 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 568 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 569 PetscFunctionReturn(0); 570 } 571 572 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 573 { 574 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 575 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 576 const PetscScalar *x,*xb; 577 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 578 const MatScalar *v; 579 PetscErrorCode ierr; 580 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 581 PetscBool usecprow=a->compressedrow.use; 582 583 PetscFunctionBegin; 584 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 585 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 586 587 idx = a->j; 588 v = a->a; 589 if (usecprow) { 590 mbs = a->compressedrow.nrows; 591 ii = a->compressedrow.i; 592 ridx = a->compressedrow.rindex; 593 ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 594 } else { 595 mbs = a->mbs; 596 ii = a->i; 597 z = zarray; 598 } 599 600 for (i=0; i<mbs; i++) { 601 n = ii[1] - ii[0]; 602 ii++; 603 sum1 = 0.0; 604 sum2 = 0.0; 605 sum3 = 0.0; 606 sum4 = 0.0; 607 sum5 = 0.0; 608 sum6 = 0.0; 609 sum7 = 0.0; 610 611 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 612 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 613 for (j=0; j<n; j++) { 614 xb = x + 7*(*idx++); 615 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 616 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 617 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 618 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 619 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 620 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 621 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 622 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 623 v += 49; 624 } 625 if (usecprow) z = zarray + 7*ridx[i]; 626 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 627 if (!usecprow) z += 7; 628 } 629 630 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 631 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 632 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 633 PetscFunctionReturn(0); 634 } 635 636 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 637 /* Default MatMult for block size 15 */ 638 639 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 640 { 641 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 642 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 643 const PetscScalar *x,*xb; 644 PetscScalar *zarray,xv; 645 const MatScalar *v; 646 PetscErrorCode ierr; 647 const PetscInt *ii,*ij=a->j,*idx; 648 PetscInt mbs,i,j,k,n,*ridx=NULL; 649 PetscBool usecprow=a->compressedrow.use; 650 651 PetscFunctionBegin; 652 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 653 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 654 655 v = a->a; 656 if (usecprow) { 657 mbs = a->compressedrow.nrows; 658 ii = a->compressedrow.i; 659 ridx = a->compressedrow.rindex; 660 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 661 } else { 662 mbs = a->mbs; 663 ii = a->i; 664 z = zarray; 665 } 666 667 for (i=0; i<mbs; i++) { 668 n = ii[i+1] - ii[i]; 669 idx = ij + ii[i]; 670 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 671 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 672 673 for (j=0; j<n; j++) { 674 xb = x + 15*(idx[j]); 675 676 for (k=0; k<15; k++) { 677 xv = xb[k]; 678 sum1 += v[0]*xv; 679 sum2 += v[1]*xv; 680 sum3 += v[2]*xv; 681 sum4 += v[3]*xv; 682 sum5 += v[4]*xv; 683 sum6 += v[5]*xv; 684 sum7 += v[6]*xv; 685 sum8 += v[7]*xv; 686 sum9 += v[8]*xv; 687 sum10 += v[9]*xv; 688 sum11 += v[10]*xv; 689 sum12 += v[11]*xv; 690 sum13 += v[12]*xv; 691 sum14 += v[13]*xv; 692 sum15 += v[14]*xv; 693 v += 15; 694 } 695 } 696 if (usecprow) z = zarray + 15*ridx[i]; 697 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 698 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 699 700 if (!usecprow) z += 15; 701 } 702 703 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 704 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 705 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 706 PetscFunctionReturn(0); 707 } 708 709 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 710 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 711 { 712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 713 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 714 const PetscScalar *x,*xb; 715 PetscScalar x1,x2,x3,x4,*zarray; 716 const MatScalar *v; 717 PetscErrorCode ierr; 718 const PetscInt *ii,*ij=a->j,*idx; 719 PetscInt mbs,i,j,n,*ridx=NULL; 720 PetscBool usecprow=a->compressedrow.use; 721 722 PetscFunctionBegin; 723 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 724 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 725 726 v = a->a; 727 if (usecprow) { 728 mbs = a->compressedrow.nrows; 729 ii = a->compressedrow.i; 730 ridx = a->compressedrow.rindex; 731 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 732 } else { 733 mbs = a->mbs; 734 ii = a->i; 735 z = zarray; 736 } 737 738 for (i=0; i<mbs; i++) { 739 n = ii[i+1] - ii[i]; 740 idx = ij + ii[i]; 741 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 742 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 743 744 for (j=0; j<n; j++) { 745 xb = x + 15*(idx[j]); 746 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 747 748 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 749 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 750 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 751 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 752 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 753 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 754 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 755 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 756 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 757 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 758 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 759 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 760 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 761 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 762 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 763 764 v += 60; 765 766 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 767 768 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 769 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 770 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 771 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 772 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 773 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 774 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 775 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 776 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 777 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 778 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 779 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 780 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 781 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 782 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 783 v += 60; 784 785 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 786 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 787 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 788 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 789 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 790 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 791 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 792 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 793 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 794 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 795 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 796 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 797 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 798 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 799 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 800 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 801 v += 60; 802 803 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 804 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 805 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 806 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 807 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 808 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 809 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 810 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 811 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 812 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 813 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 814 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 815 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 816 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 817 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 818 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 819 v += 45; 820 } 821 if (usecprow) z = zarray + 15*ridx[i]; 822 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 823 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 824 825 if (!usecprow) z += 15; 826 } 827 828 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 829 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 830 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 831 PetscFunctionReturn(0); 832 } 833 834 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 835 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 836 { 837 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 838 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 839 const PetscScalar *x,*xb; 840 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 841 const MatScalar *v; 842 PetscErrorCode ierr; 843 const PetscInt *ii,*ij=a->j,*idx; 844 PetscInt mbs,i,j,n,*ridx=NULL; 845 PetscBool usecprow=a->compressedrow.use; 846 847 PetscFunctionBegin; 848 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 849 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 850 851 v = a->a; 852 if (usecprow) { 853 mbs = a->compressedrow.nrows; 854 ii = a->compressedrow.i; 855 ridx = a->compressedrow.rindex; 856 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 857 } else { 858 mbs = a->mbs; 859 ii = a->i; 860 z = zarray; 861 } 862 863 for (i=0; i<mbs; i++) { 864 n = ii[i+1] - ii[i]; 865 idx = ij + ii[i]; 866 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 867 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 868 869 for (j=0; j<n; j++) { 870 xb = x + 15*(idx[j]); 871 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 872 x8 = xb[7]; 873 874 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 875 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 876 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 877 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 878 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 879 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 880 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 881 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 882 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 883 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 884 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 885 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 886 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 887 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 888 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 889 v += 120; 890 891 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 892 893 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 894 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 895 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 896 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 897 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 898 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 899 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 900 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 901 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 902 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 903 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 904 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 905 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 906 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 907 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 908 v += 105; 909 } 910 if (usecprow) z = zarray + 15*ridx[i]; 911 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 912 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 913 914 if (!usecprow) z += 15; 915 } 916 917 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 918 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 919 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 920 PetscFunctionReturn(0); 921 } 922 923 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 924 925 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 926 { 927 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 928 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 929 const PetscScalar *x,*xb; 930 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 931 const MatScalar *v; 932 PetscErrorCode ierr; 933 const PetscInt *ii,*ij=a->j,*idx; 934 PetscInt mbs,i,j,n,*ridx=NULL; 935 PetscBool usecprow=a->compressedrow.use; 936 937 PetscFunctionBegin; 938 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 939 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 940 941 v = a->a; 942 if (usecprow) { 943 mbs = a->compressedrow.nrows; 944 ii = a->compressedrow.i; 945 ridx = a->compressedrow.rindex; 946 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 947 } else { 948 mbs = a->mbs; 949 ii = a->i; 950 z = zarray; 951 } 952 953 for (i=0; i<mbs; i++) { 954 n = ii[i+1] - ii[i]; 955 idx = ij + ii[i]; 956 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 957 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 958 959 for (j=0; j<n; j++) { 960 xb = x + 15*(idx[j]); 961 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 962 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 963 964 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 965 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 966 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 967 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 968 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 969 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 970 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 971 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 972 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 973 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 974 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 975 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 976 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 977 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 978 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 979 v += 225; 980 } 981 if (usecprow) z = zarray + 15*ridx[i]; 982 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 983 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 984 985 if (!usecprow) z += 15; 986 } 987 988 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 989 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 990 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 991 PetscFunctionReturn(0); 992 } 993 994 995 /* 996 This will not work with MatScalar == float because it calls the BLAS 997 */ 998 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 999 { 1000 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1001 PetscScalar *z = 0,*work,*workt,*zarray; 1002 const PetscScalar *x,*xb; 1003 const MatScalar *v; 1004 PetscErrorCode ierr; 1005 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1006 const PetscInt *idx,*ii,*ridx=NULL; 1007 PetscInt ncols,k; 1008 PetscBool usecprow=a->compressedrow.use; 1009 1010 PetscFunctionBegin; 1011 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1012 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1013 1014 idx = a->j; 1015 v = a->a; 1016 if (usecprow) { 1017 mbs = a->compressedrow.nrows; 1018 ii = a->compressedrow.i; 1019 ridx = a->compressedrow.rindex; 1020 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1021 } else { 1022 mbs = a->mbs; 1023 ii = a->i; 1024 z = zarray; 1025 } 1026 1027 if (!a->mult_work) { 1028 k = PetscMax(A->rmap->n,A->cmap->n); 1029 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1030 } 1031 work = a->mult_work; 1032 for (i=0; i<mbs; i++) { 1033 n = ii[1] - ii[0]; ii++; 1034 ncols = n*bs; 1035 workt = work; 1036 for (j=0; j<n; j++) { 1037 xb = x + bs*(*idx++); 1038 for (k=0; k<bs; k++) workt[k] = xb[k]; 1039 workt += bs; 1040 } 1041 if (usecprow) z = zarray + bs*ridx[i]; 1042 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1043 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1044 v += n*bs2; 1045 if (!usecprow) z += bs; 1046 } 1047 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1048 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1049 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1050 PetscFunctionReturn(0); 1051 } 1052 1053 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1054 { 1055 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1056 const PetscScalar *x; 1057 PetscScalar *y,*z,sum; 1058 const MatScalar *v; 1059 PetscErrorCode ierr; 1060 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1061 const PetscInt *idx,*ii; 1062 PetscBool usecprow=a->compressedrow.use; 1063 1064 PetscFunctionBegin; 1065 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1066 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1067 1068 idx = a->j; 1069 v = a->a; 1070 if (usecprow) { 1071 if (zz != yy) { 1072 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1073 } 1074 mbs = a->compressedrow.nrows; 1075 ii = a->compressedrow.i; 1076 ridx = a->compressedrow.rindex; 1077 } else { 1078 ii = a->i; 1079 } 1080 1081 for (i=0; i<mbs; i++) { 1082 n = ii[1] - ii[0]; 1083 ii++; 1084 if (!usecprow) { 1085 sum = y[i]; 1086 } else { 1087 sum = y[ridx[i]]; 1088 } 1089 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1090 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1091 PetscSparseDensePlusDot(sum,x,v,idx,n); 1092 v += n; 1093 idx += n; 1094 if (usecprow) { 1095 z[ridx[i]] = sum; 1096 } else { 1097 z[i] = sum; 1098 } 1099 } 1100 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1101 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1102 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1103 PetscFunctionReturn(0); 1104 } 1105 1106 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1107 { 1108 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1109 PetscScalar *y = 0,*z = 0,sum1,sum2; 1110 const PetscScalar *x,*xb; 1111 PetscScalar x1,x2,*yarray,*zarray; 1112 const MatScalar *v; 1113 PetscErrorCode ierr; 1114 PetscInt mbs = a->mbs,i,n,j; 1115 const PetscInt *idx,*ii,*ridx = NULL; 1116 PetscBool usecprow = a->compressedrow.use; 1117 1118 PetscFunctionBegin; 1119 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1120 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1121 1122 idx = a->j; 1123 v = a->a; 1124 if (usecprow) { 1125 if (zz != yy) { 1126 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1127 } 1128 mbs = a->compressedrow.nrows; 1129 ii = a->compressedrow.i; 1130 ridx = a->compressedrow.rindex; 1131 } else { 1132 ii = a->i; 1133 y = yarray; 1134 z = zarray; 1135 } 1136 1137 for (i=0; i<mbs; i++) { 1138 n = ii[1] - ii[0]; ii++; 1139 if (usecprow) { 1140 z = zarray + 2*ridx[i]; 1141 y = yarray + 2*ridx[i]; 1142 } 1143 sum1 = y[0]; sum2 = y[1]; 1144 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1145 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1146 for (j=0; j<n; j++) { 1147 xb = x + 2*(*idx++); 1148 x1 = xb[0]; 1149 x2 = xb[1]; 1150 1151 sum1 += v[0]*x1 + v[2]*x2; 1152 sum2 += v[1]*x1 + v[3]*x2; 1153 v += 4; 1154 } 1155 z[0] = sum1; z[1] = sum2; 1156 if (!usecprow) { 1157 z += 2; y += 2; 1158 } 1159 } 1160 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1161 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1167 { 1168 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1169 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1170 const PetscScalar *x,*xb; 1171 const MatScalar *v; 1172 PetscErrorCode ierr; 1173 PetscInt mbs = a->mbs,i,j,n; 1174 const PetscInt *idx,*ii,*ridx = NULL; 1175 PetscBool usecprow = a->compressedrow.use; 1176 1177 PetscFunctionBegin; 1178 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1179 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1180 1181 idx = a->j; 1182 v = a->a; 1183 if (usecprow) { 1184 if (zz != yy) { 1185 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1186 } 1187 mbs = a->compressedrow.nrows; 1188 ii = a->compressedrow.i; 1189 ridx = a->compressedrow.rindex; 1190 } else { 1191 ii = a->i; 1192 y = yarray; 1193 z = zarray; 1194 } 1195 1196 for (i=0; i<mbs; i++) { 1197 n = ii[1] - ii[0]; ii++; 1198 if (usecprow) { 1199 z = zarray + 3*ridx[i]; 1200 y = yarray + 3*ridx[i]; 1201 } 1202 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1203 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1204 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1205 for (j=0; j<n; j++) { 1206 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1207 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1208 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1209 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1210 v += 9; 1211 } 1212 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1213 if (!usecprow) { 1214 z += 3; y += 3; 1215 } 1216 } 1217 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1218 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1219 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1220 PetscFunctionReturn(0); 1221 } 1222 1223 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1224 { 1225 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1226 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1227 const PetscScalar *x,*xb; 1228 const MatScalar *v; 1229 PetscErrorCode ierr; 1230 PetscInt mbs = a->mbs,i,j,n; 1231 const PetscInt *idx,*ii,*ridx=NULL; 1232 PetscBool usecprow=a->compressedrow.use; 1233 1234 PetscFunctionBegin; 1235 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1236 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1237 1238 idx = a->j; 1239 v = a->a; 1240 if (usecprow) { 1241 if (zz != yy) { 1242 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1243 } 1244 mbs = a->compressedrow.nrows; 1245 ii = a->compressedrow.i; 1246 ridx = a->compressedrow.rindex; 1247 } else { 1248 ii = a->i; 1249 y = yarray; 1250 z = zarray; 1251 } 1252 1253 for (i=0; i<mbs; i++) { 1254 n = ii[1] - ii[0]; ii++; 1255 if (usecprow) { 1256 z = zarray + 4*ridx[i]; 1257 y = yarray + 4*ridx[i]; 1258 } 1259 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1260 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1261 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1262 for (j=0; j<n; j++) { 1263 xb = x + 4*(*idx++); 1264 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1265 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1266 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1267 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1268 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1269 v += 16; 1270 } 1271 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1272 if (!usecprow) { 1273 z += 4; y += 4; 1274 } 1275 } 1276 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1277 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1278 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1279 PetscFunctionReturn(0); 1280 } 1281 1282 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1283 { 1284 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1285 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1286 const PetscScalar *x,*xb; 1287 PetscScalar *yarray,*zarray; 1288 const MatScalar *v; 1289 PetscErrorCode ierr; 1290 PetscInt mbs = a->mbs,i,j,n; 1291 const PetscInt *idx,*ii,*ridx = NULL; 1292 PetscBool usecprow=a->compressedrow.use; 1293 1294 PetscFunctionBegin; 1295 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1296 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1297 1298 idx = a->j; 1299 v = a->a; 1300 if (usecprow) { 1301 if (zz != yy) { 1302 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1303 } 1304 mbs = a->compressedrow.nrows; 1305 ii = a->compressedrow.i; 1306 ridx = a->compressedrow.rindex; 1307 } else { 1308 ii = a->i; 1309 y = yarray; 1310 z = zarray; 1311 } 1312 1313 for (i=0; i<mbs; i++) { 1314 n = ii[1] - ii[0]; ii++; 1315 if (usecprow) { 1316 z = zarray + 5*ridx[i]; 1317 y = yarray + 5*ridx[i]; 1318 } 1319 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1320 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1321 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1322 for (j=0; j<n; j++) { 1323 xb = x + 5*(*idx++); 1324 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1325 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1326 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1327 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1328 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1329 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1330 v += 25; 1331 } 1332 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1333 if (!usecprow) { 1334 z += 5; y += 5; 1335 } 1336 } 1337 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1338 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1339 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1340 PetscFunctionReturn(0); 1341 } 1342 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1343 { 1344 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1345 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1346 const PetscScalar *x,*xb; 1347 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1348 const MatScalar *v; 1349 PetscErrorCode ierr; 1350 PetscInt mbs = a->mbs,i,j,n; 1351 const PetscInt *idx,*ii,*ridx=NULL; 1352 PetscBool usecprow=a->compressedrow.use; 1353 1354 PetscFunctionBegin; 1355 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1356 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1357 1358 idx = a->j; 1359 v = a->a; 1360 if (usecprow) { 1361 if (zz != yy) { 1362 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1363 } 1364 mbs = a->compressedrow.nrows; 1365 ii = a->compressedrow.i; 1366 ridx = a->compressedrow.rindex; 1367 } else { 1368 ii = a->i; 1369 y = yarray; 1370 z = zarray; 1371 } 1372 1373 for (i=0; i<mbs; i++) { 1374 n = ii[1] - ii[0]; ii++; 1375 if (usecprow) { 1376 z = zarray + 6*ridx[i]; 1377 y = yarray + 6*ridx[i]; 1378 } 1379 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1380 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1381 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1382 for (j=0; j<n; j++) { 1383 xb = x + 6*(*idx++); 1384 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1385 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1386 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1387 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1388 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1389 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1390 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1391 v += 36; 1392 } 1393 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1394 if (!usecprow) { 1395 z += 6; y += 6; 1396 } 1397 } 1398 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1399 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1400 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1401 PetscFunctionReturn(0); 1402 } 1403 1404 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1405 { 1406 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1407 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1408 const PetscScalar *x,*xb; 1409 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1410 const MatScalar *v; 1411 PetscErrorCode ierr; 1412 PetscInt mbs = a->mbs,i,j,n; 1413 const PetscInt *idx,*ii,*ridx = NULL; 1414 PetscBool usecprow=a->compressedrow.use; 1415 1416 PetscFunctionBegin; 1417 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1418 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1419 1420 idx = a->j; 1421 v = a->a; 1422 if (usecprow) { 1423 if (zz != yy) { 1424 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1425 } 1426 mbs = a->compressedrow.nrows; 1427 ii = a->compressedrow.i; 1428 ridx = a->compressedrow.rindex; 1429 } else { 1430 ii = a->i; 1431 y = yarray; 1432 z = zarray; 1433 } 1434 1435 for (i=0; i<mbs; i++) { 1436 n = ii[1] - ii[0]; ii++; 1437 if (usecprow) { 1438 z = zarray + 7*ridx[i]; 1439 y = yarray + 7*ridx[i]; 1440 } 1441 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1442 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1443 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1444 for (j=0; j<n; j++) { 1445 xb = x + 7*(*idx++); 1446 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1447 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1448 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1449 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1450 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1451 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1452 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1453 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1454 v += 49; 1455 } 1456 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1457 if (!usecprow) { 1458 z += 7; y += 7; 1459 } 1460 } 1461 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1462 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1463 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1464 PetscFunctionReturn(0); 1465 } 1466 1467 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1468 { 1469 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1470 PetscScalar *z = 0,*work,*workt,*zarray; 1471 const PetscScalar *x,*xb; 1472 const MatScalar *v; 1473 PetscErrorCode ierr; 1474 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1475 PetscInt ncols,k; 1476 const PetscInt *ridx = NULL,*idx,*ii; 1477 PetscBool usecprow = a->compressedrow.use; 1478 1479 PetscFunctionBegin; 1480 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1481 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1482 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1483 1484 idx = a->j; 1485 v = a->a; 1486 if (usecprow) { 1487 mbs = a->compressedrow.nrows; 1488 ii = a->compressedrow.i; 1489 ridx = a->compressedrow.rindex; 1490 } else { 1491 mbs = a->mbs; 1492 ii = a->i; 1493 z = zarray; 1494 } 1495 1496 if (!a->mult_work) { 1497 k = PetscMax(A->rmap->n,A->cmap->n); 1498 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1499 } 1500 work = a->mult_work; 1501 for (i=0; i<mbs; i++) { 1502 n = ii[1] - ii[0]; ii++; 1503 ncols = n*bs; 1504 workt = work; 1505 for (j=0; j<n; j++) { 1506 xb = x + bs*(*idx++); 1507 for (k=0; k<bs; k++) workt[k] = xb[k]; 1508 workt += bs; 1509 } 1510 if (usecprow) z = zarray + bs*ridx[i]; 1511 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1512 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1513 v += n*bs2; 1514 if (!usecprow) z += bs; 1515 } 1516 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1517 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1518 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1519 PetscFunctionReturn(0); 1520 } 1521 1522 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1523 { 1524 PetscScalar zero = 0.0; 1525 PetscErrorCode ierr; 1526 1527 PetscFunctionBegin; 1528 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1529 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1530 PetscFunctionReturn(0); 1531 } 1532 1533 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1534 { 1535 PetscScalar zero = 0.0; 1536 PetscErrorCode ierr; 1537 1538 PetscFunctionBegin; 1539 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1540 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1541 PetscFunctionReturn(0); 1542 } 1543 1544 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1545 { 1546 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1547 PetscScalar *z,x1,x2,x3,x4,x5; 1548 const PetscScalar *x,*xb = NULL; 1549 const MatScalar *v; 1550 PetscErrorCode ierr; 1551 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1552 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1553 Mat_CompressedRow cprow = a->compressedrow; 1554 PetscBool usecprow = cprow.use; 1555 1556 PetscFunctionBegin; 1557 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1558 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1559 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1560 1561 idx = a->j; 1562 v = a->a; 1563 if (usecprow) { 1564 mbs = cprow.nrows; 1565 ii = cprow.i; 1566 ridx = cprow.rindex; 1567 } else { 1568 mbs=a->mbs; 1569 ii = a->i; 1570 xb = x; 1571 } 1572 1573 switch (bs) { 1574 case 1: 1575 for (i=0; i<mbs; i++) { 1576 if (usecprow) xb = x + ridx[i]; 1577 x1 = xb[0]; 1578 ib = idx + ii[0]; 1579 n = ii[1] - ii[0]; ii++; 1580 for (j=0; j<n; j++) { 1581 rval = ib[j]; 1582 z[rval] += PetscConj(*v) * x1; 1583 v++; 1584 } 1585 if (!usecprow) xb++; 1586 } 1587 break; 1588 case 2: 1589 for (i=0; i<mbs; i++) { 1590 if (usecprow) xb = x + 2*ridx[i]; 1591 x1 = xb[0]; x2 = xb[1]; 1592 ib = idx + ii[0]; 1593 n = ii[1] - ii[0]; ii++; 1594 for (j=0; j<n; j++) { 1595 rval = ib[j]*2; 1596 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1597 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1598 v += 4; 1599 } 1600 if (!usecprow) xb += 2; 1601 } 1602 break; 1603 case 3: 1604 for (i=0; i<mbs; i++) { 1605 if (usecprow) xb = x + 3*ridx[i]; 1606 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1607 ib = idx + ii[0]; 1608 n = ii[1] - ii[0]; ii++; 1609 for (j=0; j<n; j++) { 1610 rval = ib[j]*3; 1611 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1612 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1613 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1614 v += 9; 1615 } 1616 if (!usecprow) xb += 3; 1617 } 1618 break; 1619 case 4: 1620 for (i=0; i<mbs; i++) { 1621 if (usecprow) xb = x + 4*ridx[i]; 1622 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1623 ib = idx + ii[0]; 1624 n = ii[1] - ii[0]; ii++; 1625 for (j=0; j<n; j++) { 1626 rval = ib[j]*4; 1627 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1628 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1629 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1630 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1631 v += 16; 1632 } 1633 if (!usecprow) xb += 4; 1634 } 1635 break; 1636 case 5: 1637 for (i=0; i<mbs; i++) { 1638 if (usecprow) xb = x + 5*ridx[i]; 1639 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1640 x4 = xb[3]; x5 = xb[4]; 1641 ib = idx + ii[0]; 1642 n = ii[1] - ii[0]; ii++; 1643 for (j=0; j<n; j++) { 1644 rval = ib[j]*5; 1645 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1646 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1647 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1648 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1649 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1650 v += 25; 1651 } 1652 if (!usecprow) xb += 5; 1653 } 1654 break; 1655 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1656 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1657 #if 0 1658 { 1659 PetscInt ncols,k,bs2=a->bs2; 1660 PetscScalar *work,*workt,zb; 1661 const PetscScalar *xtmp; 1662 if (!a->mult_work) { 1663 k = PetscMax(A->rmap->n,A->cmap->n); 1664 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1665 } 1666 work = a->mult_work; 1667 xtmp = x; 1668 for (i=0; i<mbs; i++) { 1669 n = ii[1] - ii[0]; ii++; 1670 ncols = n*bs; 1671 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1672 if (usecprow) xtmp = x + bs*ridx[i]; 1673 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1674 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1675 v += n*bs2; 1676 if (!usecprow) xtmp += bs; 1677 workt = work; 1678 for (j=0; j<n; j++) { 1679 zb = z + bs*(*idx++); 1680 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1681 workt += bs; 1682 } 1683 } 1684 } 1685 #endif 1686 } 1687 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1688 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1689 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1690 PetscFunctionReturn(0); 1691 } 1692 1693 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1694 { 1695 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1696 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1697 const PetscScalar *x,*xb = 0; 1698 const MatScalar *v; 1699 PetscErrorCode ierr; 1700 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1701 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1702 Mat_CompressedRow cprow = a->compressedrow; 1703 PetscBool usecprow=cprow.use; 1704 1705 PetscFunctionBegin; 1706 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1707 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1708 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1709 1710 idx = a->j; 1711 v = a->a; 1712 if (usecprow) { 1713 mbs = cprow.nrows; 1714 ii = cprow.i; 1715 ridx = cprow.rindex; 1716 } else { 1717 mbs=a->mbs; 1718 ii = a->i; 1719 xb = x; 1720 } 1721 1722 switch (bs) { 1723 case 1: 1724 for (i=0; i<mbs; i++) { 1725 if (usecprow) xb = x + ridx[i]; 1726 x1 = xb[0]; 1727 ib = idx + ii[0]; 1728 n = ii[1] - ii[0]; ii++; 1729 for (j=0; j<n; j++) { 1730 rval = ib[j]; 1731 z[rval] += *v * x1; 1732 v++; 1733 } 1734 if (!usecprow) xb++; 1735 } 1736 break; 1737 case 2: 1738 for (i=0; i<mbs; i++) { 1739 if (usecprow) xb = x + 2*ridx[i]; 1740 x1 = xb[0]; x2 = xb[1]; 1741 ib = idx + ii[0]; 1742 n = ii[1] - ii[0]; ii++; 1743 for (j=0; j<n; j++) { 1744 rval = ib[j]*2; 1745 z[rval++] += v[0]*x1 + v[1]*x2; 1746 z[rval++] += v[2]*x1 + v[3]*x2; 1747 v += 4; 1748 } 1749 if (!usecprow) xb += 2; 1750 } 1751 break; 1752 case 3: 1753 for (i=0; i<mbs; i++) { 1754 if (usecprow) xb = x + 3*ridx[i]; 1755 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1756 ib = idx + ii[0]; 1757 n = ii[1] - ii[0]; ii++; 1758 for (j=0; j<n; j++) { 1759 rval = ib[j]*3; 1760 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1761 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1762 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1763 v += 9; 1764 } 1765 if (!usecprow) xb += 3; 1766 } 1767 break; 1768 case 4: 1769 for (i=0; i<mbs; i++) { 1770 if (usecprow) xb = x + 4*ridx[i]; 1771 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1772 ib = idx + ii[0]; 1773 n = ii[1] - ii[0]; ii++; 1774 for (j=0; j<n; j++) { 1775 rval = ib[j]*4; 1776 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1777 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1778 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1779 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1780 v += 16; 1781 } 1782 if (!usecprow) xb += 4; 1783 } 1784 break; 1785 case 5: 1786 for (i=0; i<mbs; i++) { 1787 if (usecprow) xb = x + 5*ridx[i]; 1788 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1789 x4 = xb[3]; x5 = xb[4]; 1790 ib = idx + ii[0]; 1791 n = ii[1] - ii[0]; ii++; 1792 for (j=0; j<n; j++) { 1793 rval = ib[j]*5; 1794 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1795 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1796 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1797 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1798 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1799 v += 25; 1800 } 1801 if (!usecprow) xb += 5; 1802 } 1803 break; 1804 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1805 PetscInt ncols,k; 1806 PetscScalar *work,*workt; 1807 const PetscScalar *xtmp; 1808 if (!a->mult_work) { 1809 k = PetscMax(A->rmap->n,A->cmap->n); 1810 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1811 } 1812 work = a->mult_work; 1813 xtmp = x; 1814 for (i=0; i<mbs; i++) { 1815 n = ii[1] - ii[0]; ii++; 1816 ncols = n*bs; 1817 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1818 if (usecprow) xtmp = x + bs*ridx[i]; 1819 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1820 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1821 v += n*bs2; 1822 if (!usecprow) xtmp += bs; 1823 workt = work; 1824 for (j=0; j<n; j++) { 1825 zb = z + bs*(*idx++); 1826 for (k=0; k<bs; k++) zb[k] += workt[k]; 1827 workt += bs; 1828 } 1829 } 1830 } 1831 } 1832 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1833 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1834 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1835 PetscFunctionReturn(0); 1836 } 1837 1838 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1839 { 1840 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1841 PetscInt totalnz = a->bs2*a->nz; 1842 PetscScalar oalpha = alpha; 1843 PetscErrorCode ierr; 1844 PetscBLASInt one = 1,tnz; 1845 1846 PetscFunctionBegin; 1847 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1848 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1849 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1850 PetscFunctionReturn(0); 1851 } 1852 1853 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1854 { 1855 PetscErrorCode ierr; 1856 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1857 MatScalar *v = a->a; 1858 PetscReal sum = 0.0; 1859 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1860 1861 PetscFunctionBegin; 1862 if (type == NORM_FROBENIUS) { 1863 #if defined(PETSC_USE_REAL___FP16) 1864 PetscBLASInt one = 1,cnt = bs2*nz; 1865 *norm = BLASnrm2_(&cnt,v,&one); 1866 #else 1867 for (i=0; i<bs2*nz; i++) { 1868 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1869 } 1870 #endif 1871 *norm = PetscSqrtReal(sum); 1872 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 1873 } else if (type == NORM_1) { /* maximum column sum */ 1874 PetscReal *tmp; 1875 PetscInt *bcol = a->j; 1876 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 1877 for (i=0; i<nz; i++) { 1878 for (j=0; j<bs; j++) { 1879 k1 = bs*(*bcol) + j; /* column index */ 1880 for (k=0; k<bs; k++) { 1881 tmp[k1] += PetscAbsScalar(*v); v++; 1882 } 1883 } 1884 bcol++; 1885 } 1886 *norm = 0.0; 1887 for (j=0; j<A->cmap->n; j++) { 1888 if (tmp[j] > *norm) *norm = tmp[j]; 1889 } 1890 ierr = PetscFree(tmp);CHKERRQ(ierr); 1891 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 1892 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1893 *norm = 0.0; 1894 for (k=0; k<bs; k++) { 1895 for (j=0; j<a->mbs; j++) { 1896 v = a->a + bs2*a->i[j] + k; 1897 sum = 0.0; 1898 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1899 for (k1=0; k1<bs; k1++) { 1900 sum += PetscAbsScalar(*v); 1901 v += bs; 1902 } 1903 } 1904 if (sum > *norm) *norm = sum; 1905 } 1906 } 1907 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 1908 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1909 PetscFunctionReturn(0); 1910 } 1911 1912 1913 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1914 { 1915 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1916 PetscErrorCode ierr; 1917 1918 PetscFunctionBegin; 1919 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1920 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1921 *flg = PETSC_FALSE; 1922 PetscFunctionReturn(0); 1923 } 1924 1925 /* if the a->i are the same */ 1926 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1927 if (!*flg) PetscFunctionReturn(0); 1928 1929 /* if a->j are the same */ 1930 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1931 if (!*flg) PetscFunctionReturn(0); 1932 1933 /* if a->a are the same */ 1934 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1935 PetscFunctionReturn(0); 1936 1937 } 1938 1939 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1940 { 1941 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1942 PetscErrorCode ierr; 1943 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1944 PetscScalar *x,zero = 0.0; 1945 MatScalar *aa,*aa_j; 1946 1947 PetscFunctionBegin; 1948 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1949 bs = A->rmap->bs; 1950 aa = a->a; 1951 ai = a->i; 1952 aj = a->j; 1953 ambs = a->mbs; 1954 bs2 = a->bs2; 1955 1956 ierr = VecSet(v,zero);CHKERRQ(ierr); 1957 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1958 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1959 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1960 for (i=0; i<ambs; i++) { 1961 for (j=ai[i]; j<ai[i+1]; j++) { 1962 if (aj[j] == i) { 1963 row = i*bs; 1964 aa_j = aa+j*bs2; 1965 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1966 break; 1967 } 1968 } 1969 } 1970 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1971 PetscFunctionReturn(0); 1972 } 1973 1974 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 1975 { 1976 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1977 const PetscScalar *l,*r,*li,*ri; 1978 PetscScalar x; 1979 MatScalar *aa, *v; 1980 PetscErrorCode ierr; 1981 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 1982 const PetscInt *ai,*aj; 1983 1984 PetscFunctionBegin; 1985 ai = a->i; 1986 aj = a->j; 1987 aa = a->a; 1988 m = A->rmap->n; 1989 n = A->cmap->n; 1990 bs = A->rmap->bs; 1991 mbs = a->mbs; 1992 bs2 = a->bs2; 1993 if (ll) { 1994 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1995 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1996 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1997 for (i=0; i<mbs; i++) { /* for each block row */ 1998 M = ai[i+1] - ai[i]; 1999 li = l + i*bs; 2000 v = aa + bs2*ai[i]; 2001 for (j=0; j<M; j++) { /* for each block */ 2002 for (k=0; k<bs2; k++) { 2003 (*v++) *= li[k%bs]; 2004 } 2005 } 2006 } 2007 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2008 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2009 } 2010 2011 if (rr) { 2012 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2013 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2014 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2015 for (i=0; i<mbs; i++) { /* for each block row */ 2016 iai = ai[i]; 2017 M = ai[i+1] - iai; 2018 v = aa + bs2*iai; 2019 for (j=0; j<M; j++) { /* for each block */ 2020 ri = r + bs*aj[iai+j]; 2021 for (k=0; k<bs; k++) { 2022 x = ri[k]; 2023 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2024 v += bs; 2025 } 2026 } 2027 } 2028 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2029 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2030 } 2031 PetscFunctionReturn(0); 2032 } 2033 2034 2035 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2036 { 2037 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2038 2039 PetscFunctionBegin; 2040 info->block_size = a->bs2; 2041 info->nz_allocated = a->bs2*a->maxnz; 2042 info->nz_used = a->bs2*a->nz; 2043 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2044 info->assemblies = A->num_ass; 2045 info->mallocs = A->info.mallocs; 2046 info->memory = ((PetscObject)A)->mem; 2047 if (A->factortype) { 2048 info->fill_ratio_given = A->info.fill_ratio_given; 2049 info->fill_ratio_needed = A->info.fill_ratio_needed; 2050 info->factor_mallocs = A->info.factor_mallocs; 2051 } else { 2052 info->fill_ratio_given = 0; 2053 info->fill_ratio_needed = 0; 2054 info->factor_mallocs = 0; 2055 } 2056 PetscFunctionReturn(0); 2057 } 2058 2059 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2060 { 2061 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2062 PetscErrorCode ierr; 2063 2064 PetscFunctionBegin; 2065 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2066 PetscFunctionReturn(0); 2067 } 2068