1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m,&table)); 28 PetscCall(PetscMalloc1(m+1,&nidx)); 29 PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2)); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 PetscCall(PetscBTMemzero(m,table)); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 PetscCall(ISGetIndices(is[i],&idx)); 38 PetscCall(ISGetLocalSize(is[i],&n)); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n; ++j) { 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 PetscCheck(ival<m,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 45 } 46 PetscCall(ISRestoreIndices(is[i],&idx)); 47 PetscCall(ISDestroy(&is[i])); 48 49 k = 0; 50 for (j=0; j<ov; j++) { /* for each overlap*/ 51 n = isz; 52 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end; l++) { 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i)); 67 } 68 PetscCall(PetscBTDestroy(&table)); 69 PetscCall(PetscFree(nidx)); 70 PetscCall(PetscFree(nidx2)); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 75 { 76 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 77 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 78 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 79 const PetscInt *irow,*icol; 80 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 81 PetscInt *aj = a->j,*ai = a->i; 82 MatScalar *mat_a; 83 Mat C; 84 PetscBool flag; 85 86 PetscFunctionBegin; 87 PetscCall(ISGetIndices(isrow,&irow)); 88 PetscCall(ISGetIndices(iscol,&icol)); 89 PetscCall(ISGetLocalSize(isrow,&nrows)); 90 PetscCall(ISGetLocalSize(iscol,&ncols)); 91 92 PetscCall(PetscCalloc1(1+oldcols,&smap)); 93 ssmap = smap; 94 PetscCall(PetscMalloc1(1+nrows,&lens)); 95 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 96 /* determine lens of each row */ 97 for (i=0; i<nrows; i++) { 98 kstart = ai[irow[i]]; 99 kend = kstart + a->ilen[irow[i]]; 100 lens[i] = 0; 101 for (k=kstart; k<kend; k++) { 102 if (ssmap[aj[k]]) lens[i]++; 103 } 104 } 105 /* Create and fill new matrix */ 106 if (scall == MAT_REUSE_MATRIX) { 107 c = (Mat_SeqBAIJ*)((*B)->data); 108 109 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 110 PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag)); 111 PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 112 PetscCall(PetscArrayzero(c->ilen,c->mbs)); 113 C = *B; 114 } else { 115 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C)); 116 PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE)); 117 PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 118 PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens)); 119 } 120 c = (Mat_SeqBAIJ*)(C->data); 121 for (i=0; i<nrows; i++) { 122 row = irow[i]; 123 kstart = ai[row]; 124 kend = kstart + a->ilen[row]; 125 mat_i = c->i[i]; 126 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 127 mat_a = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */ 128 mat_ilen = c->ilen + i; 129 for (k=kstart; k<kend; k++) { 130 if ((tcol=ssmap[a->j[k]])) { 131 *mat_j++ = tcol - 1; 132 PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2)); 133 mat_a += bs2; 134 (*mat_ilen)++; 135 } 136 } 137 } 138 /* sort */ 139 if (c->j && c->a) { 140 MatScalar *work; 141 PetscCall(PetscMalloc1(bs2,&work)); 142 for (i=0; i<nrows; i++) { 143 PetscInt ilen; 144 mat_i = c->i[i]; 145 mat_j = c->j + mat_i; 146 mat_a = c->a + mat_i*bs2; 147 ilen = c->ilen[i]; 148 PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); 149 } 150 PetscCall(PetscFree(work)); 151 } 152 153 /* Free work space */ 154 PetscCall(ISRestoreIndices(iscol,&icol)); 155 PetscCall(PetscFree(smap)); 156 PetscCall(PetscFree(lens)); 157 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 158 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 159 160 PetscCall(ISRestoreIndices(isrow,&irow)); 161 *B = C; 162 PetscFunctionReturn(0); 163 } 164 165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 166 { 167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 168 IS is1,is2; 169 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 170 const PetscInt *irow,*icol; 171 172 PetscFunctionBegin; 173 PetscCall(ISGetIndices(isrow,&irow)); 174 PetscCall(ISGetIndices(iscol,&icol)); 175 PetscCall(ISGetLocalSize(isrow,&nrows)); 176 PetscCall(ISGetLocalSize(iscol,&ncols)); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 maxmnbs = PetscMax(a->mbs,a->nbs); 181 PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary)); 182 PetscCall(PetscArrayzero(vary,a->mbs)); 183 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 184 for (i=0; i<a->mbs; i++) { 185 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 186 } 187 count = 0; 188 for (i=0; i<nrows; i++) { 189 j = irow[i] / bs; 190 if ((vary[j]--)==bs) iary[count++] = j; 191 } 192 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1)); 193 194 PetscCall(PetscArrayzero(vary,a->nbs)); 195 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 196 for (i=0; i<a->nbs; i++) { 197 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 198 } 199 count = 0; 200 for (i=0; i<ncols; i++) { 201 j = icol[i] / bs; 202 if ((vary[j]--)==bs) iary[count++] = j; 203 } 204 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2)); 205 PetscCall(ISRestoreIndices(isrow,&irow)); 206 PetscCall(ISRestoreIndices(iscol,&icol)); 207 PetscCall(PetscFree2(vary,iary)); 208 209 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B)); 210 PetscCall(ISDestroy(&is1)); 211 PetscCall(ISDestroy(&is2)); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 216 { 217 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 218 Mat_SubSppt *submatj = c->submatis1; 219 220 PetscFunctionBegin; 221 PetscCall((*submatj->destroy)(C)); 222 PetscCall(MatDestroySubMatrix_Private(submatj)); 223 PetscFunctionReturn(0); 224 } 225 226 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 227 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 228 { 229 PetscInt i; 230 Mat C; 231 Mat_SeqBAIJ *c; 232 Mat_SubSppt *submatj; 233 234 PetscFunctionBegin; 235 for (i=0; i<n; i++) { 236 C = (*mat)[i]; 237 c = (Mat_SeqBAIJ*)C->data; 238 submatj = c->submatis1; 239 if (submatj) { 240 if (--((PetscObject)C)->refct <= 0) { 241 PetscCall(PetscFree(C->factorprefix)); 242 PetscCall((*submatj->destroy)(C)); 243 PetscCall(MatDestroySubMatrix_Private(submatj)); 244 PetscCall(PetscFree(C->defaultvectype)); 245 PetscCall(PetscLayoutDestroy(&C->rmap)); 246 PetscCall(PetscLayoutDestroy(&C->cmap)); 247 PetscCall(PetscHeaderDestroy(&C)); 248 } 249 } else { 250 PetscCall(MatDestroy(&C)); 251 } 252 } 253 254 /* Destroy Dummy submatrices created for reuse */ 255 PetscCall(MatDestroySubMatrices_Dummy(n,mat)); 256 257 PetscCall(PetscFree(*mat)); 258 PetscFunctionReturn(0); 259 } 260 261 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 262 { 263 PetscInt i; 264 265 PetscFunctionBegin; 266 if (scall == MAT_INITIAL_MATRIX) { 267 PetscCall(PetscCalloc1(n+1,B)); 268 } 269 270 for (i=0; i<n; i++) { 271 PetscCall(MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i])); 272 } 273 PetscFunctionReturn(0); 274 } 275 276 /* -------------------------------------------------------*/ 277 /* Should check that shapes of vectors and matrices match */ 278 /* -------------------------------------------------------*/ 279 280 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 281 { 282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 283 PetscScalar *z,sum; 284 const PetscScalar *x; 285 const MatScalar *v; 286 PetscInt mbs,i,n; 287 const PetscInt *idx,*ii,*ridx=NULL; 288 PetscBool usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 PetscCall(VecGetArrayRead(xx,&x)); 292 PetscCall(VecGetArrayWrite(zz,&z)); 293 294 if (usecprow) { 295 mbs = a->compressedrow.nrows; 296 ii = a->compressedrow.i; 297 ridx = a->compressedrow.rindex; 298 PetscCall(PetscArrayzero(z,a->mbs)); 299 } else { 300 mbs = a->mbs; 301 ii = a->i; 302 } 303 304 for (i=0; i<mbs; i++) { 305 n = ii[1] - ii[0]; 306 v = a->a + ii[0]; 307 idx = a->j + ii[0]; 308 ii++; 309 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 310 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 311 sum = 0.0; 312 PetscSparseDensePlusDot(sum,x,v,idx,n); 313 if (usecprow) { 314 z[ridx[i]] = sum; 315 } else { 316 z[i] = sum; 317 } 318 } 319 PetscCall(VecRestoreArrayRead(xx,&x)); 320 PetscCall(VecRestoreArrayWrite(zz,&z)); 321 PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt)); 322 PetscFunctionReturn(0); 323 } 324 325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 326 { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 328 PetscScalar *z = NULL,sum1,sum2,*zarray; 329 const PetscScalar *x,*xb; 330 PetscScalar x1,x2; 331 const MatScalar *v; 332 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 333 PetscBool usecprow=a->compressedrow.use; 334 335 PetscFunctionBegin; 336 PetscCall(VecGetArrayRead(xx,&x)); 337 PetscCall(VecGetArrayWrite(zz,&zarray)); 338 339 idx = a->j; 340 v = a->a; 341 if (usecprow) { 342 mbs = a->compressedrow.nrows; 343 ii = a->compressedrow.i; 344 ridx = a->compressedrow.rindex; 345 PetscCall(PetscArrayzero(zarray,2*a->mbs)); 346 } else { 347 mbs = a->mbs; 348 ii = a->i; 349 z = zarray; 350 } 351 352 for (i=0; i<mbs; i++) { 353 n = ii[1] - ii[0]; ii++; 354 sum1 = 0.0; sum2 = 0.0; 355 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 356 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 357 for (j=0; j<n; j++) { 358 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 359 sum1 += v[0]*x1 + v[2]*x2; 360 sum2 += v[1]*x1 + v[3]*x2; 361 v += 4; 362 } 363 if (usecprow) z = zarray + 2*ridx[i]; 364 z[0] = sum1; z[1] = sum2; 365 if (!usecprow) z += 2; 366 } 367 PetscCall(VecRestoreArrayRead(xx,&x)); 368 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 369 PetscCall(PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt)); 370 PetscFunctionReturn(0); 371 } 372 373 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 374 { 375 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 376 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 377 const PetscScalar *x,*xb; 378 const MatScalar *v; 379 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 380 PetscBool usecprow=a->compressedrow.use; 381 382 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 383 #pragma disjoint(*v,*z,*xb) 384 #endif 385 386 PetscFunctionBegin; 387 PetscCall(VecGetArrayRead(xx,&x)); 388 PetscCall(VecGetArrayWrite(zz,&zarray)); 389 390 idx = a->j; 391 v = a->a; 392 if (usecprow) { 393 mbs = a->compressedrow.nrows; 394 ii = a->compressedrow.i; 395 ridx = a->compressedrow.rindex; 396 PetscCall(PetscArrayzero(zarray,3*a->mbs)); 397 } else { 398 mbs = a->mbs; 399 ii = a->i; 400 z = zarray; 401 } 402 403 for (i=0; i<mbs; i++) { 404 n = ii[1] - ii[0]; ii++; 405 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 406 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 407 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 408 for (j=0; j<n; j++) { 409 xb = x + 3*(*idx++); 410 x1 = xb[0]; 411 x2 = xb[1]; 412 x3 = xb[2]; 413 414 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 415 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 416 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 417 v += 9; 418 } 419 if (usecprow) z = zarray + 3*ridx[i]; 420 z[0] = sum1; z[1] = sum2; z[2] = sum3; 421 if (!usecprow) z += 3; 422 } 423 PetscCall(VecRestoreArrayRead(xx,&x)); 424 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 425 PetscCall(PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt)); 426 PetscFunctionReturn(0); 427 } 428 429 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 430 { 431 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 432 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 433 const PetscScalar *x,*xb; 434 const MatScalar *v; 435 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 436 PetscBool usecprow=a->compressedrow.use; 437 438 PetscFunctionBegin; 439 PetscCall(VecGetArrayRead(xx,&x)); 440 PetscCall(VecGetArrayWrite(zz,&zarray)); 441 442 idx = a->j; 443 v = a->a; 444 if (usecprow) { 445 mbs = a->compressedrow.nrows; 446 ii = a->compressedrow.i; 447 ridx = a->compressedrow.rindex; 448 PetscCall(PetscArrayzero(zarray,4*a->mbs)); 449 } else { 450 mbs = a->mbs; 451 ii = a->i; 452 z = zarray; 453 } 454 455 for (i=0; i<mbs; i++) { 456 n = ii[1] - ii[0]; 457 ii++; 458 sum1 = 0.0; 459 sum2 = 0.0; 460 sum3 = 0.0; 461 sum4 = 0.0; 462 463 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 464 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 465 for (j=0; j<n; j++) { 466 xb = x + 4*(*idx++); 467 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 468 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 469 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 470 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 471 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 472 v += 16; 473 } 474 if (usecprow) z = zarray + 4*ridx[i]; 475 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 476 if (!usecprow) z += 4; 477 } 478 PetscCall(VecRestoreArrayRead(xx,&x)); 479 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 480 PetscCall(PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt)); 481 PetscFunctionReturn(0); 482 } 483 484 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 485 { 486 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 487 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 488 const PetscScalar *xb,*x; 489 const MatScalar *v; 490 const PetscInt *idx,*ii,*ridx=NULL; 491 PetscInt mbs,i,j,n; 492 PetscBool usecprow=a->compressedrow.use; 493 494 PetscFunctionBegin; 495 PetscCall(VecGetArrayRead(xx,&x)); 496 PetscCall(VecGetArrayWrite(zz,&zarray)); 497 498 idx = a->j; 499 v = a->a; 500 if (usecprow) { 501 mbs = a->compressedrow.nrows; 502 ii = a->compressedrow.i; 503 ridx = a->compressedrow.rindex; 504 PetscCall(PetscArrayzero(zarray,5*a->mbs)); 505 } else { 506 mbs = a->mbs; 507 ii = a->i; 508 z = zarray; 509 } 510 511 for (i=0; i<mbs; i++) { 512 n = ii[1] - ii[0]; ii++; 513 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 514 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 515 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 516 for (j=0; j<n; j++) { 517 xb = x + 5*(*idx++); 518 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 519 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 520 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 521 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 522 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 523 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 524 v += 25; 525 } 526 if (usecprow) z = zarray + 5*ridx[i]; 527 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 528 if (!usecprow) z += 5; 529 } 530 PetscCall(VecRestoreArrayRead(xx,&x)); 531 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 532 PetscCall(PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt)); 533 PetscFunctionReturn(0); 534 } 535 536 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 537 { 538 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 539 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 540 const PetscScalar *x,*xb; 541 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 542 const MatScalar *v; 543 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 544 PetscBool usecprow=a->compressedrow.use; 545 546 PetscFunctionBegin; 547 PetscCall(VecGetArrayRead(xx,&x)); 548 PetscCall(VecGetArrayWrite(zz,&zarray)); 549 550 idx = a->j; 551 v = a->a; 552 if (usecprow) { 553 mbs = a->compressedrow.nrows; 554 ii = a->compressedrow.i; 555 ridx = a->compressedrow.rindex; 556 PetscCall(PetscArrayzero(zarray,6*a->mbs)); 557 } else { 558 mbs = a->mbs; 559 ii = a->i; 560 z = zarray; 561 } 562 563 for (i=0; i<mbs; i++) { 564 n = ii[1] - ii[0]; 565 ii++; 566 sum1 = 0.0; 567 sum2 = 0.0; 568 sum3 = 0.0; 569 sum4 = 0.0; 570 sum5 = 0.0; 571 sum6 = 0.0; 572 573 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 574 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 575 for (j=0; j<n; j++) { 576 xb = x + 6*(*idx++); 577 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 578 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 579 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 580 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 581 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 582 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 583 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 584 v += 36; 585 } 586 if (usecprow) z = zarray + 6*ridx[i]; 587 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 588 if (!usecprow) z += 6; 589 } 590 591 PetscCall(VecRestoreArrayRead(xx,&x)); 592 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 593 PetscCall(PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt)); 594 PetscFunctionReturn(0); 595 } 596 597 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 598 { 599 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 600 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 601 const PetscScalar *x,*xb; 602 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 603 const MatScalar *v; 604 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 605 PetscBool usecprow=a->compressedrow.use; 606 607 PetscFunctionBegin; 608 PetscCall(VecGetArrayRead(xx,&x)); 609 PetscCall(VecGetArrayWrite(zz,&zarray)); 610 611 idx = a->j; 612 v = a->a; 613 if (usecprow) { 614 mbs = a->compressedrow.nrows; 615 ii = a->compressedrow.i; 616 ridx = a->compressedrow.rindex; 617 PetscCall(PetscArrayzero(zarray,7*a->mbs)); 618 } else { 619 mbs = a->mbs; 620 ii = a->i; 621 z = zarray; 622 } 623 624 for (i=0; i<mbs; i++) { 625 n = ii[1] - ii[0]; 626 ii++; 627 sum1 = 0.0; 628 sum2 = 0.0; 629 sum3 = 0.0; 630 sum4 = 0.0; 631 sum5 = 0.0; 632 sum6 = 0.0; 633 sum7 = 0.0; 634 635 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 636 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 637 for (j=0; j<n; j++) { 638 xb = x + 7*(*idx++); 639 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 640 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 641 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 642 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 643 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 644 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 645 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 646 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 647 v += 49; 648 } 649 if (usecprow) z = zarray + 7*ridx[i]; 650 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 651 if (!usecprow) z += 7; 652 } 653 654 PetscCall(VecRestoreArrayRead(xx,&x)); 655 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 656 PetscCall(PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt)); 657 PetscFunctionReturn(0); 658 } 659 660 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 661 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 662 { 663 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 664 PetscScalar *z = NULL,*work,*workt,*zarray; 665 const PetscScalar *x,*xb; 666 const MatScalar *v; 667 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 668 const PetscInt *idx,*ii,*ridx=NULL; 669 PetscInt k; 670 PetscBool usecprow=a->compressedrow.use; 671 672 __m256d a0,a1,a2,a3,a4,a5; 673 __m256d w0,w1,w2,w3; 674 __m256d z0,z1,z2; 675 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 676 677 PetscFunctionBegin; 678 PetscCall(VecGetArrayRead(xx,&x)); 679 PetscCall(VecGetArrayWrite(zz,&zarray)); 680 681 idx = a->j; 682 v = a->a; 683 if (usecprow) { 684 mbs = a->compressedrow.nrows; 685 ii = a->compressedrow.i; 686 ridx = a->compressedrow.rindex; 687 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 688 } else { 689 mbs = a->mbs; 690 ii = a->i; 691 z = zarray; 692 } 693 694 if (!a->mult_work) { 695 k = PetscMax(A->rmap->n,A->cmap->n); 696 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 697 } 698 699 work = a->mult_work; 700 for (i=0; i<mbs; i++) { 701 n = ii[1] - ii[0]; ii++; 702 workt = work; 703 for (j=0; j<n; j++) { 704 xb = x + bs*(*idx++); 705 for (k=0; k<bs; k++) workt[k] = xb[k]; 706 workt += bs; 707 } 708 if (usecprow) z = zarray + bs*ridx[i]; 709 710 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 711 712 for (j=0; j<n; j++) { 713 /* first column of a */ 714 w0 = _mm256_set1_pd(work[j*9 ]); 715 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 716 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 717 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 718 719 /* second column of a */ 720 w1 = _mm256_set1_pd(work[j*9+ 1]); 721 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 722 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 723 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 724 725 /* third column of a */ 726 w2 = _mm256_set1_pd(work[j*9 +2]); 727 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 728 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 729 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 730 731 /* fourth column of a */ 732 w3 = _mm256_set1_pd(work[j*9+ 3]); 733 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 734 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 735 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 736 737 /* fifth column of a */ 738 w0 = _mm256_set1_pd(work[j*9+ 4]); 739 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 740 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 741 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 742 743 /* sixth column of a */ 744 w1 = _mm256_set1_pd(work[j*9+ 5]); 745 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 746 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 747 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 748 749 /* seventh column of a */ 750 w2 = _mm256_set1_pd(work[j*9+ 6]); 751 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 752 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 753 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 754 755 /* eighth column of a */ 756 w3 = _mm256_set1_pd(work[j*9+ 7]); 757 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 758 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 759 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 760 761 /* ninth column of a */ 762 w0 = _mm256_set1_pd(work[j*9+ 8]); 763 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 764 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 765 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 766 } 767 768 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 769 770 v += n*bs2; 771 if (!usecprow) z += bs; 772 } 773 PetscCall(VecRestoreArrayRead(xx,&x)); 774 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 775 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 776 PetscFunctionReturn(0); 777 } 778 #endif 779 780 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 781 { 782 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 783 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 784 const PetscScalar *x,*xb; 785 PetscScalar *zarray,xv; 786 const MatScalar *v; 787 const PetscInt *ii,*ij=a->j,*idx; 788 PetscInt mbs,i,j,k,n,*ridx=NULL; 789 PetscBool usecprow=a->compressedrow.use; 790 791 PetscFunctionBegin; 792 PetscCall(VecGetArrayRead(xx,&x)); 793 PetscCall(VecGetArrayWrite(zz,&zarray)); 794 795 v = a->a; 796 if (usecprow) { 797 mbs = a->compressedrow.nrows; 798 ii = a->compressedrow.i; 799 ridx = a->compressedrow.rindex; 800 PetscCall(PetscArrayzero(zarray,11*a->mbs)); 801 } else { 802 mbs = a->mbs; 803 ii = a->i; 804 z = zarray; 805 } 806 807 for (i=0; i<mbs; i++) { 808 n = ii[i+1] - ii[i]; 809 idx = ij + ii[i]; 810 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 811 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 812 813 for (j=0; j<n; j++) { 814 xb = x + 11*(idx[j]); 815 816 for (k=0; k<11; k++) { 817 xv = xb[k]; 818 sum1 += v[0]*xv; 819 sum2 += v[1]*xv; 820 sum3 += v[2]*xv; 821 sum4 += v[3]*xv; 822 sum5 += v[4]*xv; 823 sum6 += v[5]*xv; 824 sum7 += v[6]*xv; 825 sum8 += v[7]*xv; 826 sum9 += v[8]*xv; 827 sum10 += v[9]*xv; 828 sum11 += v[10]*xv; 829 v += 11; 830 } 831 } 832 if (usecprow) z = zarray + 11*ridx[i]; 833 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 834 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 835 836 if (!usecprow) z += 11; 837 } 838 839 PetscCall(VecRestoreArrayRead(xx,&x)); 840 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 841 PetscCall(PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt)); 842 PetscFunctionReturn(0); 843 } 844 845 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 846 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) 847 { 848 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 849 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 850 const PetscScalar *x,*xb; 851 PetscScalar *zarray,xv; 852 const MatScalar *v; 853 const PetscInt *ii,*ij=a->j,*idx; 854 PetscInt mbs,i,j,k,n,*ridx=NULL; 855 PetscBool usecprow=a->compressedrow.use; 856 857 PetscFunctionBegin; 858 PetscCall(VecGetArrayRead(xx,&x)); 859 PetscCall(VecGetArrayWrite(zz,&zarray)); 860 861 v = a->a; 862 if (usecprow) { 863 mbs = a->compressedrow.nrows; 864 ii = a->compressedrow.i; 865 ridx = a->compressedrow.rindex; 866 PetscCall(PetscArrayzero(zarray,12*a->mbs)); 867 } else { 868 mbs = a->mbs; 869 ii = a->i; 870 z = zarray; 871 } 872 873 for (i=0; i<mbs; i++) { 874 n = ii[i+1] - ii[i]; 875 idx = ij + ii[i]; 876 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 877 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; 878 879 for (j=0; j<n; j++) { 880 xb = x + 12*(idx[j]); 881 882 for (k=0; k<12; k++) { 883 xv = xb[k]; 884 sum1 += v[0]*xv; 885 sum2 += v[1]*xv; 886 sum3 += v[2]*xv; 887 sum4 += v[3]*xv; 888 sum5 += v[4]*xv; 889 sum6 += v[5]*xv; 890 sum7 += v[6]*xv; 891 sum8 += v[7]*xv; 892 sum9 += v[8]*xv; 893 sum10 += v[9]*xv; 894 sum11 += v[10]*xv; 895 sum12 += v[11]*xv; 896 v += 12; 897 } 898 } 899 if (usecprow) z = zarray + 12*ridx[i]; 900 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 901 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 902 if (!usecprow) z += 12; 903 } 904 PetscCall(VecRestoreArrayRead(xx,&x)); 905 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 906 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 907 PetscFunctionReturn(0); 908 } 909 910 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) 911 { 912 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 913 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 914 const PetscScalar *x,*xb; 915 PetscScalar *zarray,*yarray,xv; 916 const MatScalar *v; 917 const PetscInt *ii,*ij=a->j,*idx; 918 PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; 919 PetscBool usecprow=a->compressedrow.use; 920 921 PetscFunctionBegin; 922 PetscCall(VecGetArrayRead(xx,&x)); 923 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 924 925 v = a->a; 926 if (usecprow) { 927 if (zz != yy) { 928 PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); 929 } 930 mbs = a->compressedrow.nrows; 931 ii = a->compressedrow.i; 932 ridx = a->compressedrow.rindex; 933 } else { 934 ii = a->i; 935 y = yarray; 936 z = zarray; 937 } 938 939 for (i=0; i<mbs; i++) { 940 n = ii[i+1] - ii[i]; 941 idx = ij + ii[i]; 942 943 if (usecprow) { 944 y = yarray + 12*ridx[i]; 945 z = zarray + 12*ridx[i]; 946 } 947 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 948 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 949 950 for (j=0; j<n; j++) { 951 xb = x + 12*(idx[j]); 952 953 for (k=0; k<12; k++) { 954 xv = xb[k]; 955 sum1 += v[0]*xv; 956 sum2 += v[1]*xv; 957 sum3 += v[2]*xv; 958 sum4 += v[3]*xv; 959 sum5 += v[4]*xv; 960 sum6 += v[5]*xv; 961 sum7 += v[6]*xv; 962 sum8 += v[7]*xv; 963 sum9 += v[8]*xv; 964 sum10 += v[9]*xv; 965 sum11 += v[10]*xv; 966 sum12 += v[11]*xv; 967 v += 12; 968 } 969 } 970 971 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 972 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 973 if (!usecprow) { 974 y += 12; 975 z += 12; 976 } 977 } 978 PetscCall(VecRestoreArrayRead(xx,&x)); 979 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 980 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 981 PetscFunctionReturn(0); 982 } 983 984 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 985 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) 986 { 987 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 988 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 989 const PetscScalar *x,*xb; 990 PetscScalar x1,x2,x3,x4,*zarray; 991 const MatScalar *v; 992 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 993 PetscInt mbs,i,j,n; 994 PetscBool usecprow=a->compressedrow.use; 995 996 PetscFunctionBegin; 997 PetscCall(VecGetArrayRead(xx,&x)); 998 PetscCall(VecGetArrayWrite(zz,&zarray)); 999 1000 v = a->a; 1001 if (usecprow) { 1002 mbs = a->compressedrow.nrows; 1003 ii = a->compressedrow.i; 1004 ridx = a->compressedrow.rindex; 1005 PetscCall(PetscArrayzero(zarray,12*a->mbs)); 1006 } else { 1007 mbs = a->mbs; 1008 ii = a->i; 1009 z = zarray; 1010 } 1011 1012 for (i=0; i<mbs; i++) { 1013 n = ii[i+1] - ii[i]; 1014 idx = ij + ii[i]; 1015 1016 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1017 for (j=0; j<n; j++) { 1018 xb = x + 12*(idx[j]); 1019 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1020 1021 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1022 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1023 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1024 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1025 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1026 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1027 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1028 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1029 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1030 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1031 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1032 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1033 v += 48; 1034 1035 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1036 1037 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1038 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1039 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1040 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1041 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1042 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1043 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1044 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1045 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1046 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1047 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1048 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1049 v += 48; 1050 1051 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1052 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1053 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1054 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1055 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1056 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1057 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1058 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1059 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1060 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1061 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1062 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1063 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1064 v += 48; 1065 1066 } 1067 if (usecprow) z = zarray + 12*ridx[i]; 1068 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1069 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1070 if (!usecprow) z += 12; 1071 } 1072 PetscCall(VecRestoreArrayRead(xx,&x)); 1073 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1074 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 1075 PetscFunctionReturn(0); 1076 } 1077 1078 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1079 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) 1080 { 1081 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1082 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1083 const PetscScalar *x,*xb; 1084 PetscScalar x1,x2,x3,x4,*zarray,*yarray; 1085 const MatScalar *v; 1086 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1087 PetscInt mbs = a->mbs,i,j,n; 1088 PetscBool usecprow=a->compressedrow.use; 1089 1090 PetscFunctionBegin; 1091 PetscCall(VecGetArrayRead(xx,&x)); 1092 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1093 1094 v = a->a; 1095 if (usecprow) { 1096 if (zz != yy) { 1097 PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); 1098 } 1099 mbs = a->compressedrow.nrows; 1100 ii = a->compressedrow.i; 1101 ridx = a->compressedrow.rindex; 1102 } else { 1103 ii = a->i; 1104 y = yarray; 1105 z = zarray; 1106 } 1107 1108 for (i=0; i<mbs; i++) { 1109 n = ii[i+1] - ii[i]; 1110 idx = ij + ii[i]; 1111 1112 if (usecprow) { 1113 y = yarray + 12*ridx[i]; 1114 z = zarray + 12*ridx[i]; 1115 } 1116 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1117 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 1118 1119 for (j=0; j<n; j++) { 1120 xb = x + 12*(idx[j]); 1121 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1122 1123 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1124 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1125 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1126 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1127 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1128 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1129 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1130 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1131 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1132 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1133 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1134 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1135 v += 48; 1136 1137 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1138 1139 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1140 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1141 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1142 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1143 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1144 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1145 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1146 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1147 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1148 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1149 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1150 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1151 v += 48; 1152 1153 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1154 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1155 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1156 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1157 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1158 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1159 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1160 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1161 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1162 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1163 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1164 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1165 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1166 v += 48; 1167 1168 } 1169 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1170 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1171 if (!usecprow) { 1172 y += 12; 1173 z += 12; 1174 } 1175 } 1176 PetscCall(VecRestoreArrayRead(xx,&x)); 1177 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1178 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 1179 PetscFunctionReturn(0); 1180 } 1181 1182 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1183 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) 1184 { 1185 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1186 PetscScalar *z = NULL,*zarray; 1187 const PetscScalar *x,*work; 1188 const MatScalar *v = a->a; 1189 PetscInt mbs,i,j,n; 1190 const PetscInt *idx = a->j,*ii,*ridx=NULL; 1191 PetscBool usecprow=a->compressedrow.use; 1192 const PetscInt bs = 12, bs2 = 144; 1193 1194 __m256d a0,a1,a2,a3,a4,a5; 1195 __m256d w0,w1,w2,w3; 1196 __m256d z0,z1,z2; 1197 1198 PetscFunctionBegin; 1199 PetscCall(VecGetArrayRead(xx,&x)); 1200 PetscCall(VecGetArrayWrite(zz,&zarray)); 1201 1202 if (usecprow) { 1203 mbs = a->compressedrow.nrows; 1204 ii = a->compressedrow.i; 1205 ridx = a->compressedrow.rindex; 1206 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 1207 } else { 1208 mbs = a->mbs; 1209 ii = a->i; 1210 z = zarray; 1211 } 1212 1213 for (i=0; i<mbs; i++) { 1214 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 1215 1216 n = ii[1] - ii[0]; ii++; 1217 for (j=0; j<n; j++) { 1218 work = x + bs*(*idx++); 1219 1220 /* first column of a */ 1221 w0 = _mm256_set1_pd(work[0]); 1222 a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0); 1223 a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1); 1224 a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2); 1225 1226 /* second column of a */ 1227 w1 = _mm256_set1_pd(work[1]); 1228 a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0); 1229 a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1); 1230 a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2); 1231 1232 /* third column of a */ 1233 w2 = _mm256_set1_pd(work[2]); 1234 a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0); 1235 a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1); 1236 a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2); 1237 1238 /* fourth column of a */ 1239 w3 = _mm256_set1_pd(work[3]); 1240 a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0); 1241 a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1); 1242 a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2); 1243 1244 /* fifth column of a */ 1245 w0 = _mm256_set1_pd(work[4]); 1246 a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0); 1247 a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1); 1248 a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2); 1249 1250 /* sixth column of a */ 1251 w1 = _mm256_set1_pd(work[5]); 1252 a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0); 1253 a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1); 1254 a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2); 1255 1256 /* seventh column of a */ 1257 w2 = _mm256_set1_pd(work[6]); 1258 a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0); 1259 a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1); 1260 a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2); 1261 1262 /* eighth column of a */ 1263 w3 = _mm256_set1_pd(work[7]); 1264 a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0); 1265 a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1); 1266 a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2); 1267 1268 /* ninth column of a */ 1269 w0 = _mm256_set1_pd(work[8]); 1270 a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0); 1271 a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1); 1272 a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2); 1273 1274 /* tenth column of a */ 1275 w1 = _mm256_set1_pd(work[9]); 1276 a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0); 1277 a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1); 1278 a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2); 1279 1280 /* eleventh column of a */ 1281 w2 = _mm256_set1_pd(work[10]); 1282 a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0); 1283 a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1); 1284 a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2); 1285 1286 /* twelveth column of a */ 1287 w3 = _mm256_set1_pd(work[11]); 1288 a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0); 1289 a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1); 1290 a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2); 1291 1292 v += bs2; 1293 } 1294 if (usecprow) z = zarray + bs*ridx[i]; 1295 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2); 1296 if (!usecprow) z += bs; 1297 } 1298 PetscCall(VecRestoreArrayRead(xx,&x)); 1299 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1300 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 1301 PetscFunctionReturn(0); 1302 } 1303 #endif 1304 1305 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1306 /* Default MatMult for block size 15 */ 1307 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1308 { 1309 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1310 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1311 const PetscScalar *x,*xb; 1312 PetscScalar *zarray,xv; 1313 const MatScalar *v; 1314 const PetscInt *ii,*ij=a->j,*idx; 1315 PetscInt mbs,i,j,k,n,*ridx=NULL; 1316 PetscBool usecprow=a->compressedrow.use; 1317 1318 PetscFunctionBegin; 1319 PetscCall(VecGetArrayRead(xx,&x)); 1320 PetscCall(VecGetArrayWrite(zz,&zarray)); 1321 1322 v = a->a; 1323 if (usecprow) { 1324 mbs = a->compressedrow.nrows; 1325 ii = a->compressedrow.i; 1326 ridx = a->compressedrow.rindex; 1327 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1328 } else { 1329 mbs = a->mbs; 1330 ii = a->i; 1331 z = zarray; 1332 } 1333 1334 for (i=0; i<mbs; i++) { 1335 n = ii[i+1] - ii[i]; 1336 idx = ij + ii[i]; 1337 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1338 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1339 1340 for (j=0; j<n; j++) { 1341 xb = x + 15*(idx[j]); 1342 1343 for (k=0; k<15; k++) { 1344 xv = xb[k]; 1345 sum1 += v[0]*xv; 1346 sum2 += v[1]*xv; 1347 sum3 += v[2]*xv; 1348 sum4 += v[3]*xv; 1349 sum5 += v[4]*xv; 1350 sum6 += v[5]*xv; 1351 sum7 += v[6]*xv; 1352 sum8 += v[7]*xv; 1353 sum9 += v[8]*xv; 1354 sum10 += v[9]*xv; 1355 sum11 += v[10]*xv; 1356 sum12 += v[11]*xv; 1357 sum13 += v[12]*xv; 1358 sum14 += v[13]*xv; 1359 sum15 += v[14]*xv; 1360 v += 15; 1361 } 1362 } 1363 if (usecprow) z = zarray + 15*ridx[i]; 1364 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1365 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1366 1367 if (!usecprow) z += 15; 1368 } 1369 1370 PetscCall(VecRestoreArrayRead(xx,&x)); 1371 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1372 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1373 PetscFunctionReturn(0); 1374 } 1375 1376 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1377 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1378 { 1379 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1380 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1381 const PetscScalar *x,*xb; 1382 PetscScalar x1,x2,x3,x4,*zarray; 1383 const MatScalar *v; 1384 const PetscInt *ii,*ij=a->j,*idx; 1385 PetscInt mbs,i,j,n,*ridx=NULL; 1386 PetscBool usecprow=a->compressedrow.use; 1387 1388 PetscFunctionBegin; 1389 PetscCall(VecGetArrayRead(xx,&x)); 1390 PetscCall(VecGetArrayWrite(zz,&zarray)); 1391 1392 v = a->a; 1393 if (usecprow) { 1394 mbs = a->compressedrow.nrows; 1395 ii = a->compressedrow.i; 1396 ridx = a->compressedrow.rindex; 1397 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1398 } else { 1399 mbs = a->mbs; 1400 ii = a->i; 1401 z = zarray; 1402 } 1403 1404 for (i=0; i<mbs; i++) { 1405 n = ii[i+1] - ii[i]; 1406 idx = ij + ii[i]; 1407 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1408 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1409 1410 for (j=0; j<n; j++) { 1411 xb = x + 15*(idx[j]); 1412 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1413 1414 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1415 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1416 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1417 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1418 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1419 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1420 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1421 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1422 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1423 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1424 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1425 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1426 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1427 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1428 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1429 1430 v += 60; 1431 1432 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1433 1434 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1435 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1436 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1437 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1438 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1439 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1440 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1441 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1442 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1443 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1444 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1445 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1446 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1447 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1448 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1449 v += 60; 1450 1451 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1452 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1453 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1454 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1455 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1456 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1457 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1458 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1459 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1460 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1461 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1462 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1463 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1464 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1465 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1466 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1467 v += 60; 1468 1469 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1470 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1471 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1472 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1473 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1474 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1475 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1476 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1477 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1478 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1479 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1480 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1481 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1482 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1483 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1484 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1485 v += 45; 1486 } 1487 if (usecprow) z = zarray + 15*ridx[i]; 1488 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1489 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1490 1491 if (!usecprow) z += 15; 1492 } 1493 1494 PetscCall(VecRestoreArrayRead(xx,&x)); 1495 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1496 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1497 PetscFunctionReturn(0); 1498 } 1499 1500 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1501 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1502 { 1503 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1504 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1505 const PetscScalar *x,*xb; 1506 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1507 const MatScalar *v; 1508 const PetscInt *ii,*ij=a->j,*idx; 1509 PetscInt mbs,i,j,n,*ridx=NULL; 1510 PetscBool usecprow=a->compressedrow.use; 1511 1512 PetscFunctionBegin; 1513 PetscCall(VecGetArrayRead(xx,&x)); 1514 PetscCall(VecGetArrayWrite(zz,&zarray)); 1515 1516 v = a->a; 1517 if (usecprow) { 1518 mbs = a->compressedrow.nrows; 1519 ii = a->compressedrow.i; 1520 ridx = a->compressedrow.rindex; 1521 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1522 } else { 1523 mbs = a->mbs; 1524 ii = a->i; 1525 z = zarray; 1526 } 1527 1528 for (i=0; i<mbs; i++) { 1529 n = ii[i+1] - ii[i]; 1530 idx = ij + ii[i]; 1531 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1532 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1533 1534 for (j=0; j<n; j++) { 1535 xb = x + 15*(idx[j]); 1536 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1537 x8 = xb[7]; 1538 1539 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1540 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1541 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1542 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1543 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1544 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1545 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1546 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1547 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1548 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1549 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1550 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1551 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1552 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1553 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1554 v += 120; 1555 1556 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1557 1558 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1559 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1560 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1561 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1562 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1563 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1564 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1565 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1566 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1567 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1568 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1569 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1570 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1571 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1572 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1573 v += 105; 1574 } 1575 if (usecprow) z = zarray + 15*ridx[i]; 1576 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1577 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1578 1579 if (!usecprow) z += 15; 1580 } 1581 1582 PetscCall(VecRestoreArrayRead(xx,&x)); 1583 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1584 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1585 PetscFunctionReturn(0); 1586 } 1587 1588 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1589 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1590 { 1591 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1592 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1593 const PetscScalar *x,*xb; 1594 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1595 const MatScalar *v; 1596 const PetscInt *ii,*ij=a->j,*idx; 1597 PetscInt mbs,i,j,n,*ridx=NULL; 1598 PetscBool usecprow=a->compressedrow.use; 1599 1600 PetscFunctionBegin; 1601 PetscCall(VecGetArrayRead(xx,&x)); 1602 PetscCall(VecGetArrayWrite(zz,&zarray)); 1603 1604 v = a->a; 1605 if (usecprow) { 1606 mbs = a->compressedrow.nrows; 1607 ii = a->compressedrow.i; 1608 ridx = a->compressedrow.rindex; 1609 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1610 } else { 1611 mbs = a->mbs; 1612 ii = a->i; 1613 z = zarray; 1614 } 1615 1616 for (i=0; i<mbs; i++) { 1617 n = ii[i+1] - ii[i]; 1618 idx = ij + ii[i]; 1619 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1620 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1621 1622 for (j=0; j<n; j++) { 1623 xb = x + 15*(idx[j]); 1624 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1625 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1626 1627 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1628 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1629 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1630 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1631 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1632 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1633 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1634 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1635 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1636 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1637 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1638 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1639 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1640 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1641 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1642 v += 225; 1643 } 1644 if (usecprow) z = zarray + 15*ridx[i]; 1645 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1646 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1647 1648 if (!usecprow) z += 15; 1649 } 1650 1651 PetscCall(VecRestoreArrayRead(xx,&x)); 1652 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1653 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1654 PetscFunctionReturn(0); 1655 } 1656 1657 /* 1658 This will not work with MatScalar == float because it calls the BLAS 1659 */ 1660 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1661 { 1662 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1663 PetscScalar *z = NULL,*work,*workt,*zarray; 1664 const PetscScalar *x,*xb; 1665 const MatScalar *v; 1666 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1667 const PetscInt *idx,*ii,*ridx=NULL; 1668 PetscInt ncols,k; 1669 PetscBool usecprow=a->compressedrow.use; 1670 1671 PetscFunctionBegin; 1672 PetscCall(VecGetArrayRead(xx,&x)); 1673 PetscCall(VecGetArrayWrite(zz,&zarray)); 1674 1675 idx = a->j; 1676 v = a->a; 1677 if (usecprow) { 1678 mbs = a->compressedrow.nrows; 1679 ii = a->compressedrow.i; 1680 ridx = a->compressedrow.rindex; 1681 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 1682 } else { 1683 mbs = a->mbs; 1684 ii = a->i; 1685 z = zarray; 1686 } 1687 1688 if (!a->mult_work) { 1689 k = PetscMax(A->rmap->n,A->cmap->n); 1690 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 1691 } 1692 work = a->mult_work; 1693 for (i=0; i<mbs; i++) { 1694 n = ii[1] - ii[0]; ii++; 1695 ncols = n*bs; 1696 workt = work; 1697 for (j=0; j<n; j++) { 1698 xb = x + bs*(*idx++); 1699 for (k=0; k<bs; k++) workt[k] = xb[k]; 1700 workt += bs; 1701 } 1702 if (usecprow) z = zarray + bs*ridx[i]; 1703 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1704 v += n*bs2; 1705 if (!usecprow) z += bs; 1706 } 1707 PetscCall(VecRestoreArrayRead(xx,&x)); 1708 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1709 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1714 { 1715 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1716 const PetscScalar *x; 1717 PetscScalar *y,*z,sum; 1718 const MatScalar *v; 1719 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1720 const PetscInt *idx,*ii; 1721 PetscBool usecprow=a->compressedrow.use; 1722 1723 PetscFunctionBegin; 1724 PetscCall(VecGetArrayRead(xx,&x)); 1725 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 1726 1727 idx = a->j; 1728 v = a->a; 1729 if (usecprow) { 1730 if (zz != yy) { 1731 PetscCall(PetscArraycpy(z,y,mbs)); 1732 } 1733 mbs = a->compressedrow.nrows; 1734 ii = a->compressedrow.i; 1735 ridx = a->compressedrow.rindex; 1736 } else { 1737 ii = a->i; 1738 } 1739 1740 for (i=0; i<mbs; i++) { 1741 n = ii[1] - ii[0]; 1742 ii++; 1743 if (!usecprow) { 1744 sum = y[i]; 1745 } else { 1746 sum = y[ridx[i]]; 1747 } 1748 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1749 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1750 PetscSparseDensePlusDot(sum,x,v,idx,n); 1751 v += n; 1752 idx += n; 1753 if (usecprow) { 1754 z[ridx[i]] = sum; 1755 } else { 1756 z[i] = sum; 1757 } 1758 } 1759 PetscCall(VecRestoreArrayRead(xx,&x)); 1760 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 1761 PetscCall(PetscLogFlops(2.0*a->nz)); 1762 PetscFunctionReturn(0); 1763 } 1764 1765 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1766 { 1767 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1768 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1769 const PetscScalar *x,*xb; 1770 PetscScalar x1,x2,*yarray,*zarray; 1771 const MatScalar *v; 1772 PetscInt mbs = a->mbs,i,n,j; 1773 const PetscInt *idx,*ii,*ridx = NULL; 1774 PetscBool usecprow = a->compressedrow.use; 1775 1776 PetscFunctionBegin; 1777 PetscCall(VecGetArrayRead(xx,&x)); 1778 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1779 1780 idx = a->j; 1781 v = a->a; 1782 if (usecprow) { 1783 if (zz != yy) { 1784 PetscCall(PetscArraycpy(zarray,yarray,2*mbs)); 1785 } 1786 mbs = a->compressedrow.nrows; 1787 ii = a->compressedrow.i; 1788 ridx = a->compressedrow.rindex; 1789 } else { 1790 ii = a->i; 1791 y = yarray; 1792 z = zarray; 1793 } 1794 1795 for (i=0; i<mbs; i++) { 1796 n = ii[1] - ii[0]; ii++; 1797 if (usecprow) { 1798 z = zarray + 2*ridx[i]; 1799 y = yarray + 2*ridx[i]; 1800 } 1801 sum1 = y[0]; sum2 = y[1]; 1802 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1803 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1804 for (j=0; j<n; j++) { 1805 xb = x + 2*(*idx++); 1806 x1 = xb[0]; 1807 x2 = xb[1]; 1808 1809 sum1 += v[0]*x1 + v[2]*x2; 1810 sum2 += v[1]*x1 + v[3]*x2; 1811 v += 4; 1812 } 1813 z[0] = sum1; z[1] = sum2; 1814 if (!usecprow) { 1815 z += 2; y += 2; 1816 } 1817 } 1818 PetscCall(VecRestoreArrayRead(xx,&x)); 1819 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1820 PetscCall(PetscLogFlops(4.0*a->nz)); 1821 PetscFunctionReturn(0); 1822 } 1823 1824 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1825 { 1826 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1827 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1828 const PetscScalar *x,*xb; 1829 const MatScalar *v; 1830 PetscInt mbs = a->mbs,i,j,n; 1831 const PetscInt *idx,*ii,*ridx = NULL; 1832 PetscBool usecprow = a->compressedrow.use; 1833 1834 PetscFunctionBegin; 1835 PetscCall(VecGetArrayRead(xx,&x)); 1836 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1837 1838 idx = a->j; 1839 v = a->a; 1840 if (usecprow) { 1841 if (zz != yy) { 1842 PetscCall(PetscArraycpy(zarray,yarray,3*mbs)); 1843 } 1844 mbs = a->compressedrow.nrows; 1845 ii = a->compressedrow.i; 1846 ridx = a->compressedrow.rindex; 1847 } else { 1848 ii = a->i; 1849 y = yarray; 1850 z = zarray; 1851 } 1852 1853 for (i=0; i<mbs; i++) { 1854 n = ii[1] - ii[0]; ii++; 1855 if (usecprow) { 1856 z = zarray + 3*ridx[i]; 1857 y = yarray + 3*ridx[i]; 1858 } 1859 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1860 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1861 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1862 for (j=0; j<n; j++) { 1863 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1864 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1865 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1866 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1867 v += 9; 1868 } 1869 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1870 if (!usecprow) { 1871 z += 3; y += 3; 1872 } 1873 } 1874 PetscCall(VecRestoreArrayRead(xx,&x)); 1875 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1876 PetscCall(PetscLogFlops(18.0*a->nz)); 1877 PetscFunctionReturn(0); 1878 } 1879 1880 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1881 { 1882 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1883 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1884 const PetscScalar *x,*xb; 1885 const MatScalar *v; 1886 PetscInt mbs = a->mbs,i,j,n; 1887 const PetscInt *idx,*ii,*ridx=NULL; 1888 PetscBool usecprow=a->compressedrow.use; 1889 1890 PetscFunctionBegin; 1891 PetscCall(VecGetArrayRead(xx,&x)); 1892 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1893 1894 idx = a->j; 1895 v = a->a; 1896 if (usecprow) { 1897 if (zz != yy) { 1898 PetscCall(PetscArraycpy(zarray,yarray,4*mbs)); 1899 } 1900 mbs = a->compressedrow.nrows; 1901 ii = a->compressedrow.i; 1902 ridx = a->compressedrow.rindex; 1903 } else { 1904 ii = a->i; 1905 y = yarray; 1906 z = zarray; 1907 } 1908 1909 for (i=0; i<mbs; i++) { 1910 n = ii[1] - ii[0]; ii++; 1911 if (usecprow) { 1912 z = zarray + 4*ridx[i]; 1913 y = yarray + 4*ridx[i]; 1914 } 1915 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1916 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1917 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1918 for (j=0; j<n; j++) { 1919 xb = x + 4*(*idx++); 1920 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1921 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1922 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1923 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1924 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1925 v += 16; 1926 } 1927 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1928 if (!usecprow) { 1929 z += 4; y += 4; 1930 } 1931 } 1932 PetscCall(VecRestoreArrayRead(xx,&x)); 1933 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1934 PetscCall(PetscLogFlops(32.0*a->nz)); 1935 PetscFunctionReturn(0); 1936 } 1937 1938 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1939 { 1940 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1941 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1942 const PetscScalar *x,*xb; 1943 PetscScalar *yarray,*zarray; 1944 const MatScalar *v; 1945 PetscInt mbs = a->mbs,i,j,n; 1946 const PetscInt *idx,*ii,*ridx = NULL; 1947 PetscBool usecprow=a->compressedrow.use; 1948 1949 PetscFunctionBegin; 1950 PetscCall(VecGetArrayRead(xx,&x)); 1951 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1952 1953 idx = a->j; 1954 v = a->a; 1955 if (usecprow) { 1956 if (zz != yy) { 1957 PetscCall(PetscArraycpy(zarray,yarray,5*mbs)); 1958 } 1959 mbs = a->compressedrow.nrows; 1960 ii = a->compressedrow.i; 1961 ridx = a->compressedrow.rindex; 1962 } else { 1963 ii = a->i; 1964 y = yarray; 1965 z = zarray; 1966 } 1967 1968 for (i=0; i<mbs; i++) { 1969 n = ii[1] - ii[0]; ii++; 1970 if (usecprow) { 1971 z = zarray + 5*ridx[i]; 1972 y = yarray + 5*ridx[i]; 1973 } 1974 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1975 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1976 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1977 for (j=0; j<n; j++) { 1978 xb = x + 5*(*idx++); 1979 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1980 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1981 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1982 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1983 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1984 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1985 v += 25; 1986 } 1987 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1988 if (!usecprow) { 1989 z += 5; y += 5; 1990 } 1991 } 1992 PetscCall(VecRestoreArrayRead(xx,&x)); 1993 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1994 PetscCall(PetscLogFlops(50.0*a->nz)); 1995 PetscFunctionReturn(0); 1996 } 1997 1998 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1999 { 2000 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2001 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 2002 const PetscScalar *x,*xb; 2003 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 2004 const MatScalar *v; 2005 PetscInt mbs = a->mbs,i,j,n; 2006 const PetscInt *idx,*ii,*ridx=NULL; 2007 PetscBool usecprow=a->compressedrow.use; 2008 2009 PetscFunctionBegin; 2010 PetscCall(VecGetArrayRead(xx,&x)); 2011 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2012 2013 idx = a->j; 2014 v = a->a; 2015 if (usecprow) { 2016 if (zz != yy) { 2017 PetscCall(PetscArraycpy(zarray,yarray,6*mbs)); 2018 } 2019 mbs = a->compressedrow.nrows; 2020 ii = a->compressedrow.i; 2021 ridx = a->compressedrow.rindex; 2022 } else { 2023 ii = a->i; 2024 y = yarray; 2025 z = zarray; 2026 } 2027 2028 for (i=0; i<mbs; i++) { 2029 n = ii[1] - ii[0]; ii++; 2030 if (usecprow) { 2031 z = zarray + 6*ridx[i]; 2032 y = yarray + 6*ridx[i]; 2033 } 2034 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2035 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2036 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2037 for (j=0; j<n; j++) { 2038 xb = x + 6*(*idx++); 2039 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2040 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2041 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2042 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2043 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2044 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2045 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2046 v += 36; 2047 } 2048 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2049 if (!usecprow) { 2050 z += 6; y += 6; 2051 } 2052 } 2053 PetscCall(VecRestoreArrayRead(xx,&x)); 2054 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2055 PetscCall(PetscLogFlops(72.0*a->nz)); 2056 PetscFunctionReturn(0); 2057 } 2058 2059 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2060 { 2061 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2062 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2063 const PetscScalar *x,*xb; 2064 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2065 const MatScalar *v; 2066 PetscInt mbs = a->mbs,i,j,n; 2067 const PetscInt *idx,*ii,*ridx = NULL; 2068 PetscBool usecprow=a->compressedrow.use; 2069 2070 PetscFunctionBegin; 2071 PetscCall(VecGetArrayRead(xx,&x)); 2072 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2073 2074 idx = a->j; 2075 v = a->a; 2076 if (usecprow) { 2077 if (zz != yy) { 2078 PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); 2079 } 2080 mbs = a->compressedrow.nrows; 2081 ii = a->compressedrow.i; 2082 ridx = a->compressedrow.rindex; 2083 } else { 2084 ii = a->i; 2085 y = yarray; 2086 z = zarray; 2087 } 2088 2089 for (i=0; i<mbs; i++) { 2090 n = ii[1] - ii[0]; ii++; 2091 if (usecprow) { 2092 z = zarray + 7*ridx[i]; 2093 y = yarray + 7*ridx[i]; 2094 } 2095 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2096 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2097 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2098 for (j=0; j<n; j++) { 2099 xb = x + 7*(*idx++); 2100 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2101 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2102 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2103 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2104 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2105 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2106 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2107 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2108 v += 49; 2109 } 2110 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2111 if (!usecprow) { 2112 z += 7; y += 7; 2113 } 2114 } 2115 PetscCall(VecRestoreArrayRead(xx,&x)); 2116 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2117 PetscCall(PetscLogFlops(98.0*a->nz)); 2118 PetscFunctionReturn(0); 2119 } 2120 2121 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2122 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 2123 { 2124 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2125 PetscScalar *z = NULL,*work,*workt,*zarray; 2126 const PetscScalar *x,*xb; 2127 const MatScalar *v; 2128 PetscInt mbs,i,j,n; 2129 PetscInt k; 2130 PetscBool usecprow=a->compressedrow.use; 2131 const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; 2132 2133 __m256d a0,a1,a2,a3,a4,a5; 2134 __m256d w0,w1,w2,w3; 2135 __m256d z0,z1,z2; 2136 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 2137 2138 PetscFunctionBegin; 2139 PetscCall(VecCopy(yy,zz)); 2140 PetscCall(VecGetArrayRead(xx,&x)); 2141 PetscCall(VecGetArray(zz,&zarray)); 2142 2143 idx = a->j; 2144 v = a->a; 2145 if (usecprow) { 2146 mbs = a->compressedrow.nrows; 2147 ii = a->compressedrow.i; 2148 ridx = a->compressedrow.rindex; 2149 } else { 2150 mbs = a->mbs; 2151 ii = a->i; 2152 z = zarray; 2153 } 2154 2155 if (!a->mult_work) { 2156 k = PetscMax(A->rmap->n,A->cmap->n); 2157 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2158 } 2159 2160 work = a->mult_work; 2161 for (i=0; i<mbs; i++) { 2162 n = ii[1] - ii[0]; ii++; 2163 workt = work; 2164 for (j=0; j<n; j++) { 2165 xb = x + bs*(*idx++); 2166 for (k=0; k<bs; k++) workt[k] = xb[k]; 2167 workt += bs; 2168 } 2169 if (usecprow) z = zarray + bs*ridx[i]; 2170 2171 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 2172 2173 for (j=0; j<n; j++) { 2174 /* first column of a */ 2175 w0 = _mm256_set1_pd(work[j*9 ]); 2176 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2177 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2178 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 2179 2180 /* second column of a */ 2181 w1 = _mm256_set1_pd(work[j*9+ 1]); 2182 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2183 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2184 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2185 2186 /* third column of a */ 2187 w2 = _mm256_set1_pd(work[j*9+ 2]); 2188 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 2189 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 2190 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 2191 2192 /* fourth column of a */ 2193 w3 = _mm256_set1_pd(work[j*9+ 3]); 2194 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 2195 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 2196 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 2197 2198 /* fifth column of a */ 2199 w0 = _mm256_set1_pd(work[j*9+ 4]); 2200 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 2201 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 2202 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 2203 2204 /* sixth column of a */ 2205 w1 = _mm256_set1_pd(work[j*9+ 5]); 2206 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2207 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2208 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2209 2210 /* seventh column of a */ 2211 w2 = _mm256_set1_pd(work[j*9+ 6]); 2212 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 2213 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 2214 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 2215 2216 /* eighth column of a */ 2217 w3 = _mm256_set1_pd(work[j*9+ 7]); 2218 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 2219 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 2220 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 2221 2222 /* ninth column of a */ 2223 w0 = _mm256_set1_pd(work[j*9+ 8]); 2224 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2225 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2226 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 2227 } 2228 2229 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 2230 2231 v += n*bs2; 2232 if (!usecprow) z += bs; 2233 } 2234 PetscCall(VecRestoreArrayRead(xx,&x)); 2235 PetscCall(VecRestoreArray(zz,&zarray)); 2236 PetscCall(PetscLogFlops(162.0*a->nz)); 2237 PetscFunctionReturn(0); 2238 } 2239 #endif 2240 2241 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 2242 { 2243 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2244 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 2245 const PetscScalar *x,*xb; 2246 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 2247 const MatScalar *v; 2248 PetscInt mbs = a->mbs,i,j,n; 2249 const PetscInt *idx,*ii,*ridx = NULL; 2250 PetscBool usecprow=a->compressedrow.use; 2251 2252 PetscFunctionBegin; 2253 PetscCall(VecGetArrayRead(xx,&x)); 2254 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2255 2256 idx = a->j; 2257 v = a->a; 2258 if (usecprow) { 2259 if (zz != yy) { 2260 PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); 2261 } 2262 mbs = a->compressedrow.nrows; 2263 ii = a->compressedrow.i; 2264 ridx = a->compressedrow.rindex; 2265 } else { 2266 ii = a->i; 2267 y = yarray; 2268 z = zarray; 2269 } 2270 2271 for (i=0; i<mbs; i++) { 2272 n = ii[1] - ii[0]; ii++; 2273 if (usecprow) { 2274 z = zarray + 11*ridx[i]; 2275 y = yarray + 11*ridx[i]; 2276 } 2277 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2278 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 2279 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2280 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2281 for (j=0; j<n; j++) { 2282 xb = x + 11*(*idx++); 2283 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 2284 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 2285 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 2286 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 2287 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 2288 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 2289 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 2290 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 2291 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 2292 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 2293 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 2294 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 2295 v += 121; 2296 } 2297 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2298 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 2299 if (!usecprow) { 2300 z += 11; y += 11; 2301 } 2302 } 2303 PetscCall(VecRestoreArrayRead(xx,&x)); 2304 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2305 PetscCall(PetscLogFlops(242.0*a->nz)); 2306 PetscFunctionReturn(0); 2307 } 2308 2309 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2310 { 2311 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2312 PetscScalar *z = NULL,*work,*workt,*zarray; 2313 const PetscScalar *x,*xb; 2314 const MatScalar *v; 2315 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2316 PetscInt ncols,k; 2317 const PetscInt *ridx = NULL,*idx,*ii; 2318 PetscBool usecprow = a->compressedrow.use; 2319 2320 PetscFunctionBegin; 2321 PetscCall(VecCopy(yy,zz)); 2322 PetscCall(VecGetArrayRead(xx,&x)); 2323 PetscCall(VecGetArray(zz,&zarray)); 2324 2325 idx = a->j; 2326 v = a->a; 2327 if (usecprow) { 2328 mbs = a->compressedrow.nrows; 2329 ii = a->compressedrow.i; 2330 ridx = a->compressedrow.rindex; 2331 } else { 2332 mbs = a->mbs; 2333 ii = a->i; 2334 z = zarray; 2335 } 2336 2337 if (!a->mult_work) { 2338 k = PetscMax(A->rmap->n,A->cmap->n); 2339 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2340 } 2341 work = a->mult_work; 2342 for (i=0; i<mbs; i++) { 2343 n = ii[1] - ii[0]; ii++; 2344 ncols = n*bs; 2345 workt = work; 2346 for (j=0; j<n; j++) { 2347 xb = x + bs*(*idx++); 2348 for (k=0; k<bs; k++) workt[k] = xb[k]; 2349 workt += bs; 2350 } 2351 if (usecprow) z = zarray + bs*ridx[i]; 2352 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2353 v += n*bs2; 2354 if (!usecprow) z += bs; 2355 } 2356 PetscCall(VecRestoreArrayRead(xx,&x)); 2357 PetscCall(VecRestoreArray(zz,&zarray)); 2358 PetscCall(PetscLogFlops(2.0*a->nz*bs2)); 2359 PetscFunctionReturn(0); 2360 } 2361 2362 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2363 { 2364 PetscScalar zero = 0.0; 2365 2366 PetscFunctionBegin; 2367 PetscCall(VecSet(zz,zero)); 2368 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz)); 2369 PetscFunctionReturn(0); 2370 } 2371 2372 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2373 { 2374 PetscScalar zero = 0.0; 2375 2376 PetscFunctionBegin; 2377 PetscCall(VecSet(zz,zero)); 2378 PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz)); 2379 PetscFunctionReturn(0); 2380 } 2381 2382 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2383 { 2384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2385 PetscScalar *z,x1,x2,x3,x4,x5; 2386 const PetscScalar *x,*xb = NULL; 2387 const MatScalar *v; 2388 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2389 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2390 Mat_CompressedRow cprow = a->compressedrow; 2391 PetscBool usecprow = cprow.use; 2392 2393 PetscFunctionBegin; 2394 if (yy != zz) PetscCall(VecCopy(yy,zz)); 2395 PetscCall(VecGetArrayRead(xx,&x)); 2396 PetscCall(VecGetArray(zz,&z)); 2397 2398 idx = a->j; 2399 v = a->a; 2400 if (usecprow) { 2401 mbs = cprow.nrows; 2402 ii = cprow.i; 2403 ridx = cprow.rindex; 2404 } else { 2405 mbs=a->mbs; 2406 ii = a->i; 2407 xb = x; 2408 } 2409 2410 switch (bs) { 2411 case 1: 2412 for (i=0; i<mbs; i++) { 2413 if (usecprow) xb = x + ridx[i]; 2414 x1 = xb[0]; 2415 ib = idx + ii[0]; 2416 n = ii[1] - ii[0]; ii++; 2417 for (j=0; j<n; j++) { 2418 rval = ib[j]; 2419 z[rval] += PetscConj(*v) * x1; 2420 v++; 2421 } 2422 if (!usecprow) xb++; 2423 } 2424 break; 2425 case 2: 2426 for (i=0; i<mbs; i++) { 2427 if (usecprow) xb = x + 2*ridx[i]; 2428 x1 = xb[0]; x2 = xb[1]; 2429 ib = idx + ii[0]; 2430 n = ii[1] - ii[0]; ii++; 2431 for (j=0; j<n; j++) { 2432 rval = ib[j]*2; 2433 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2434 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2435 v += 4; 2436 } 2437 if (!usecprow) xb += 2; 2438 } 2439 break; 2440 case 3: 2441 for (i=0; i<mbs; i++) { 2442 if (usecprow) xb = x + 3*ridx[i]; 2443 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2444 ib = idx + ii[0]; 2445 n = ii[1] - ii[0]; ii++; 2446 for (j=0; j<n; j++) { 2447 rval = ib[j]*3; 2448 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2449 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2450 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2451 v += 9; 2452 } 2453 if (!usecprow) xb += 3; 2454 } 2455 break; 2456 case 4: 2457 for (i=0; i<mbs; i++) { 2458 if (usecprow) xb = x + 4*ridx[i]; 2459 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2460 ib = idx + ii[0]; 2461 n = ii[1] - ii[0]; ii++; 2462 for (j=0; j<n; j++) { 2463 rval = ib[j]*4; 2464 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2465 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2466 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2467 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2468 v += 16; 2469 } 2470 if (!usecprow) xb += 4; 2471 } 2472 break; 2473 case 5: 2474 for (i=0; i<mbs; i++) { 2475 if (usecprow) xb = x + 5*ridx[i]; 2476 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2477 x4 = xb[3]; x5 = xb[4]; 2478 ib = idx + ii[0]; 2479 n = ii[1] - ii[0]; ii++; 2480 for (j=0; j<n; j++) { 2481 rval = ib[j]*5; 2482 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2483 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2484 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2485 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2486 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2487 v += 25; 2488 } 2489 if (!usecprow) xb += 5; 2490 } 2491 break; 2492 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2493 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2494 #if 0 2495 { 2496 PetscInt ncols,k,bs2=a->bs2; 2497 PetscScalar *work,*workt,zb; 2498 const PetscScalar *xtmp; 2499 if (!a->mult_work) { 2500 k = PetscMax(A->rmap->n,A->cmap->n); 2501 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2502 } 2503 work = a->mult_work; 2504 xtmp = x; 2505 for (i=0; i<mbs; i++) { 2506 n = ii[1] - ii[0]; ii++; 2507 ncols = n*bs; 2508 PetscCall(PetscArrayzero(work,ncols)); 2509 if (usecprow) xtmp = x + bs*ridx[i]; 2510 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2511 v += n*bs2; 2512 if (!usecprow) xtmp += bs; 2513 workt = work; 2514 for (j=0; j<n; j++) { 2515 zb = z + bs*(*idx++); 2516 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2517 workt += bs; 2518 } 2519 } 2520 } 2521 #endif 2522 } 2523 PetscCall(VecRestoreArrayRead(xx,&x)); 2524 PetscCall(VecRestoreArray(zz,&z)); 2525 PetscCall(PetscLogFlops(2.0*a->nz*a->bs2)); 2526 PetscFunctionReturn(0); 2527 } 2528 2529 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2530 { 2531 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2532 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2533 const PetscScalar *x,*xb = NULL; 2534 const MatScalar *v; 2535 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2536 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2537 Mat_CompressedRow cprow = a->compressedrow; 2538 PetscBool usecprow=cprow.use; 2539 2540 PetscFunctionBegin; 2541 if (yy != zz) PetscCall(VecCopy(yy,zz)); 2542 PetscCall(VecGetArrayRead(xx,&x)); 2543 PetscCall(VecGetArray(zz,&z)); 2544 2545 idx = a->j; 2546 v = a->a; 2547 if (usecprow) { 2548 mbs = cprow.nrows; 2549 ii = cprow.i; 2550 ridx = cprow.rindex; 2551 } else { 2552 mbs=a->mbs; 2553 ii = a->i; 2554 xb = x; 2555 } 2556 2557 switch (bs) { 2558 case 1: 2559 for (i=0; i<mbs; i++) { 2560 if (usecprow) xb = x + ridx[i]; 2561 x1 = xb[0]; 2562 ib = idx + ii[0]; 2563 n = ii[1] - ii[0]; ii++; 2564 for (j=0; j<n; j++) { 2565 rval = ib[j]; 2566 z[rval] += *v * x1; 2567 v++; 2568 } 2569 if (!usecprow) xb++; 2570 } 2571 break; 2572 case 2: 2573 for (i=0; i<mbs; i++) { 2574 if (usecprow) xb = x + 2*ridx[i]; 2575 x1 = xb[0]; x2 = xb[1]; 2576 ib = idx + ii[0]; 2577 n = ii[1] - ii[0]; ii++; 2578 for (j=0; j<n; j++) { 2579 rval = ib[j]*2; 2580 z[rval++] += v[0]*x1 + v[1]*x2; 2581 z[rval++] += v[2]*x1 + v[3]*x2; 2582 v += 4; 2583 } 2584 if (!usecprow) xb += 2; 2585 } 2586 break; 2587 case 3: 2588 for (i=0; i<mbs; i++) { 2589 if (usecprow) xb = x + 3*ridx[i]; 2590 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2591 ib = idx + ii[0]; 2592 n = ii[1] - ii[0]; ii++; 2593 for (j=0; j<n; j++) { 2594 rval = ib[j]*3; 2595 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2596 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2597 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2598 v += 9; 2599 } 2600 if (!usecprow) xb += 3; 2601 } 2602 break; 2603 case 4: 2604 for (i=0; i<mbs; i++) { 2605 if (usecprow) xb = x + 4*ridx[i]; 2606 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2607 ib = idx + ii[0]; 2608 n = ii[1] - ii[0]; ii++; 2609 for (j=0; j<n; j++) { 2610 rval = ib[j]*4; 2611 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2612 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2613 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2614 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2615 v += 16; 2616 } 2617 if (!usecprow) xb += 4; 2618 } 2619 break; 2620 case 5: 2621 for (i=0; i<mbs; i++) { 2622 if (usecprow) xb = x + 5*ridx[i]; 2623 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2624 x4 = xb[3]; x5 = xb[4]; 2625 ib = idx + ii[0]; 2626 n = ii[1] - ii[0]; ii++; 2627 for (j=0; j<n; j++) { 2628 rval = ib[j]*5; 2629 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2630 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2631 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2632 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2633 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2634 v += 25; 2635 } 2636 if (!usecprow) xb += 5; 2637 } 2638 break; 2639 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2640 PetscInt ncols,k; 2641 PetscScalar *work,*workt; 2642 const PetscScalar *xtmp; 2643 if (!a->mult_work) { 2644 k = PetscMax(A->rmap->n,A->cmap->n); 2645 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2646 } 2647 work = a->mult_work; 2648 xtmp = x; 2649 for (i=0; i<mbs; i++) { 2650 n = ii[1] - ii[0]; ii++; 2651 ncols = n*bs; 2652 PetscCall(PetscArrayzero(work,ncols)); 2653 if (usecprow) xtmp = x + bs*ridx[i]; 2654 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2655 v += n*bs2; 2656 if (!usecprow) xtmp += bs; 2657 workt = work; 2658 for (j=0; j<n; j++) { 2659 zb = z + bs*(*idx++); 2660 for (k=0; k<bs; k++) zb[k] += workt[k]; 2661 workt += bs; 2662 } 2663 } 2664 } 2665 } 2666 PetscCall(VecRestoreArrayRead(xx,&x)); 2667 PetscCall(VecRestoreArray(zz,&z)); 2668 PetscCall(PetscLogFlops(2.0*a->nz*a->bs2)); 2669 PetscFunctionReturn(0); 2670 } 2671 2672 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2673 { 2674 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2675 PetscInt totalnz = a->bs2*a->nz; 2676 PetscScalar oalpha = alpha; 2677 PetscBLASInt one = 1,tnz; 2678 2679 PetscFunctionBegin; 2680 PetscCall(PetscBLASIntCast(totalnz,&tnz)); 2681 PetscCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2682 PetscCall(PetscLogFlops(totalnz)); 2683 PetscFunctionReturn(0); 2684 } 2685 2686 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2687 { 2688 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2689 MatScalar *v = a->a; 2690 PetscReal sum = 0.0; 2691 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2692 2693 PetscFunctionBegin; 2694 if (type == NORM_FROBENIUS) { 2695 #if defined(PETSC_USE_REAL___FP16) 2696 PetscBLASInt one = 1,cnt = bs2*nz; 2697 PetscCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2698 #else 2699 for (i=0; i<bs2*nz; i++) { 2700 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2701 } 2702 #endif 2703 *norm = PetscSqrtReal(sum); 2704 PetscCall(PetscLogFlops(2.0*bs2*nz)); 2705 } else if (type == NORM_1) { /* maximum column sum */ 2706 PetscReal *tmp; 2707 PetscInt *bcol = a->j; 2708 PetscCall(PetscCalloc1(A->cmap->n+1,&tmp)); 2709 for (i=0; i<nz; i++) { 2710 for (j=0; j<bs; j++) { 2711 k1 = bs*(*bcol) + j; /* column index */ 2712 for (k=0; k<bs; k++) { 2713 tmp[k1] += PetscAbsScalar(*v); v++; 2714 } 2715 } 2716 bcol++; 2717 } 2718 *norm = 0.0; 2719 for (j=0; j<A->cmap->n; j++) { 2720 if (tmp[j] > *norm) *norm = tmp[j]; 2721 } 2722 PetscCall(PetscFree(tmp)); 2723 PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); 2724 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2725 *norm = 0.0; 2726 for (k=0; k<bs; k++) { 2727 for (j=0; j<a->mbs; j++) { 2728 v = a->a + bs2*a->i[j] + k; 2729 sum = 0.0; 2730 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2731 for (k1=0; k1<bs; k1++) { 2732 sum += PetscAbsScalar(*v); 2733 v += bs; 2734 } 2735 } 2736 if (sum > *norm) *norm = sum; 2737 } 2738 } 2739 PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); 2740 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2741 PetscFunctionReturn(0); 2742 } 2743 2744 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2745 { 2746 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2747 2748 PetscFunctionBegin; 2749 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2750 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2751 *flg = PETSC_FALSE; 2752 PetscFunctionReturn(0); 2753 } 2754 2755 /* if the a->i are the same */ 2756 PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg)); 2757 if (!*flg) PetscFunctionReturn(0); 2758 2759 /* if a->j are the same */ 2760 PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg)); 2761 if (!*flg) PetscFunctionReturn(0); 2762 2763 /* if a->a are the same */ 2764 PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg)); 2765 PetscFunctionReturn(0); 2766 2767 } 2768 2769 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2770 { 2771 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2772 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2773 PetscScalar *x,zero = 0.0; 2774 MatScalar *aa,*aa_j; 2775 2776 PetscFunctionBegin; 2777 PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2778 bs = A->rmap->bs; 2779 aa = a->a; 2780 ai = a->i; 2781 aj = a->j; 2782 ambs = a->mbs; 2783 bs2 = a->bs2; 2784 2785 PetscCall(VecSet(v,zero)); 2786 PetscCall(VecGetArray(v,&x)); 2787 PetscCall(VecGetLocalSize(v,&n)); 2788 PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2789 for (i=0; i<ambs; i++) { 2790 for (j=ai[i]; j<ai[i+1]; j++) { 2791 if (aj[j] == i) { 2792 row = i*bs; 2793 aa_j = aa+j*bs2; 2794 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2795 break; 2796 } 2797 } 2798 } 2799 PetscCall(VecRestoreArray(v,&x)); 2800 PetscFunctionReturn(0); 2801 } 2802 2803 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2804 { 2805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2806 const PetscScalar *l,*r,*li,*ri; 2807 PetscScalar x; 2808 MatScalar *aa, *v; 2809 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2810 const PetscInt *ai,*aj; 2811 2812 PetscFunctionBegin; 2813 ai = a->i; 2814 aj = a->j; 2815 aa = a->a; 2816 m = A->rmap->n; 2817 n = A->cmap->n; 2818 bs = A->rmap->bs; 2819 mbs = a->mbs; 2820 bs2 = a->bs2; 2821 if (ll) { 2822 PetscCall(VecGetArrayRead(ll,&l)); 2823 PetscCall(VecGetLocalSize(ll,&lm)); 2824 PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2825 for (i=0; i<mbs; i++) { /* for each block row */ 2826 M = ai[i+1] - ai[i]; 2827 li = l + i*bs; 2828 v = aa + bs2*ai[i]; 2829 for (j=0; j<M; j++) { /* for each block */ 2830 for (k=0; k<bs2; k++) { 2831 (*v++) *= li[k%bs]; 2832 } 2833 } 2834 } 2835 PetscCall(VecRestoreArrayRead(ll,&l)); 2836 PetscCall(PetscLogFlops(a->nz)); 2837 } 2838 2839 if (rr) { 2840 PetscCall(VecGetArrayRead(rr,&r)); 2841 PetscCall(VecGetLocalSize(rr,&rn)); 2842 PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2843 for (i=0; i<mbs; i++) { /* for each block row */ 2844 iai = ai[i]; 2845 M = ai[i+1] - iai; 2846 v = aa + bs2*iai; 2847 for (j=0; j<M; j++) { /* for each block */ 2848 ri = r + bs*aj[iai+j]; 2849 for (k=0; k<bs; k++) { 2850 x = ri[k]; 2851 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2852 v += bs; 2853 } 2854 } 2855 } 2856 PetscCall(VecRestoreArrayRead(rr,&r)); 2857 PetscCall(PetscLogFlops(a->nz)); 2858 } 2859 PetscFunctionReturn(0); 2860 } 2861 2862 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2863 { 2864 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2865 2866 PetscFunctionBegin; 2867 info->block_size = a->bs2; 2868 info->nz_allocated = a->bs2*a->maxnz; 2869 info->nz_used = a->bs2*a->nz; 2870 info->nz_unneeded = info->nz_allocated - info->nz_used; 2871 info->assemblies = A->num_ass; 2872 info->mallocs = A->info.mallocs; 2873 info->memory = ((PetscObject)A)->mem; 2874 if (A->factortype) { 2875 info->fill_ratio_given = A->info.fill_ratio_given; 2876 info->fill_ratio_needed = A->info.fill_ratio_needed; 2877 info->factor_mallocs = A->info.factor_mallocs; 2878 } else { 2879 info->fill_ratio_given = 0; 2880 info->fill_ratio_needed = 0; 2881 info->factor_mallocs = 0; 2882 } 2883 PetscFunctionReturn(0); 2884 } 2885 2886 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2887 { 2888 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2889 2890 PetscFunctionBegin; 2891 PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs])); 2892 PetscFunctionReturn(0); 2893 } 2894 2895 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2896 { 2897 PetscFunctionBegin; 2898 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 2899 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2900 PetscFunctionReturn(0); 2901 } 2902 2903 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2904 { 2905 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2906 PetscScalar *z = NULL,sum1; 2907 const PetscScalar *xb; 2908 PetscScalar x1; 2909 const MatScalar *v,*vv; 2910 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2911 PetscBool usecprow=a->compressedrow.use; 2912 2913 PetscFunctionBegin; 2914 idx = a->j; 2915 v = a->a; 2916 if (usecprow) { 2917 mbs = a->compressedrow.nrows; 2918 ii = a->compressedrow.i; 2919 ridx = a->compressedrow.rindex; 2920 } else { 2921 mbs = a->mbs; 2922 ii = a->i; 2923 z = c; 2924 } 2925 2926 for (i=0; i<mbs; i++) { 2927 n = ii[1] - ii[0]; ii++; 2928 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2929 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2930 if (usecprow) z = c + ridx[i]; 2931 jj = idx; 2932 vv = v; 2933 for (k=0; k<cn; k++) { 2934 idx = jj; 2935 v = vv; 2936 sum1 = 0.0; 2937 for (j=0; j<n; j++) { 2938 xb = b + (*idx++); x1 = xb[0+k*bm]; 2939 sum1 += v[0]*x1; 2940 v += 1; 2941 } 2942 z[0+k*cm] = sum1; 2943 } 2944 if (!usecprow) z += 1; 2945 } 2946 PetscFunctionReturn(0); 2947 } 2948 2949 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2950 { 2951 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2952 PetscScalar *z = NULL,sum1,sum2; 2953 const PetscScalar *xb; 2954 PetscScalar x1,x2; 2955 const MatScalar *v,*vv; 2956 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2957 PetscBool usecprow=a->compressedrow.use; 2958 2959 PetscFunctionBegin; 2960 idx = a->j; 2961 v = a->a; 2962 if (usecprow) { 2963 mbs = a->compressedrow.nrows; 2964 ii = a->compressedrow.i; 2965 ridx = a->compressedrow.rindex; 2966 } else { 2967 mbs = a->mbs; 2968 ii = a->i; 2969 z = c; 2970 } 2971 2972 for (i=0; i<mbs; i++) { 2973 n = ii[1] - ii[0]; ii++; 2974 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2975 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2976 if (usecprow) z = c + 2*ridx[i]; 2977 jj = idx; 2978 vv = v; 2979 for (k=0; k<cn; k++) { 2980 idx = jj; 2981 v = vv; 2982 sum1 = 0.0; sum2 = 0.0; 2983 for (j=0; j<n; j++) { 2984 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 2985 sum1 += v[0]*x1 + v[2]*x2; 2986 sum2 += v[1]*x1 + v[3]*x2; 2987 v += 4; 2988 } 2989 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 2990 } 2991 if (!usecprow) z += 2; 2992 } 2993 PetscFunctionReturn(0); 2994 } 2995 2996 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2997 { 2998 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2999 PetscScalar *z = NULL,sum1,sum2,sum3; 3000 const PetscScalar *xb; 3001 PetscScalar x1,x2,x3; 3002 const MatScalar *v,*vv; 3003 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3004 PetscBool usecprow=a->compressedrow.use; 3005 3006 PetscFunctionBegin; 3007 idx = a->j; 3008 v = a->a; 3009 if (usecprow) { 3010 mbs = a->compressedrow.nrows; 3011 ii = a->compressedrow.i; 3012 ridx = a->compressedrow.rindex; 3013 } else { 3014 mbs = a->mbs; 3015 ii = a->i; 3016 z = c; 3017 } 3018 3019 for (i=0; i<mbs; i++) { 3020 n = ii[1] - ii[0]; ii++; 3021 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3022 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3023 if (usecprow) z = c + 3*ridx[i]; 3024 jj = idx; 3025 vv = v; 3026 for (k=0; k<cn; k++) { 3027 idx = jj; 3028 v = vv; 3029 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 3030 for (j=0; j<n; j++) { 3031 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 3032 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3033 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3034 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3035 v += 9; 3036 } 3037 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 3038 } 3039 if (!usecprow) z += 3; 3040 } 3041 PetscFunctionReturn(0); 3042 } 3043 3044 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3045 { 3046 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3047 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 3048 const PetscScalar *xb; 3049 PetscScalar x1,x2,x3,x4; 3050 const MatScalar *v,*vv; 3051 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3052 PetscBool usecprow=a->compressedrow.use; 3053 3054 PetscFunctionBegin; 3055 idx = a->j; 3056 v = a->a; 3057 if (usecprow) { 3058 mbs = a->compressedrow.nrows; 3059 ii = a->compressedrow.i; 3060 ridx = a->compressedrow.rindex; 3061 } else { 3062 mbs = a->mbs; 3063 ii = a->i; 3064 z = c; 3065 } 3066 3067 for (i=0; i<mbs; i++) { 3068 n = ii[1] - ii[0]; ii++; 3069 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3070 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3071 if (usecprow) z = c + 4*ridx[i]; 3072 jj = idx; 3073 vv = v; 3074 for (k=0; k<cn; k++) { 3075 idx = jj; 3076 v = vv; 3077 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 3078 for (j=0; j<n; j++) { 3079 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 3080 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3081 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3082 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3083 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3084 v += 16; 3085 } 3086 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 3087 } 3088 if (!usecprow) z += 4; 3089 } 3090 PetscFunctionReturn(0); 3091 } 3092 3093 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3094 { 3095 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3096 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 3097 const PetscScalar *xb; 3098 PetscScalar x1,x2,x3,x4,x5; 3099 const MatScalar *v,*vv; 3100 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3101 PetscBool usecprow=a->compressedrow.use; 3102 3103 PetscFunctionBegin; 3104 idx = a->j; 3105 v = a->a; 3106 if (usecprow) { 3107 mbs = a->compressedrow.nrows; 3108 ii = a->compressedrow.i; 3109 ridx = a->compressedrow.rindex; 3110 } else { 3111 mbs = a->mbs; 3112 ii = a->i; 3113 z = c; 3114 } 3115 3116 for (i=0; i<mbs; i++) { 3117 n = ii[1] - ii[0]; ii++; 3118 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3119 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3120 if (usecprow) z = c + 5*ridx[i]; 3121 jj = idx; 3122 vv = v; 3123 for (k=0; k<cn; k++) { 3124 idx = jj; 3125 v = vv; 3126 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 3127 for (j=0; j<n; j++) { 3128 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 3129 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3130 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3131 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3132 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3133 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3134 v += 25; 3135 } 3136 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 3137 } 3138 if (!usecprow) z += 5; 3139 } 3140 PetscFunctionReturn(0); 3141 } 3142 3143 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 3144 { 3145 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3146 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 3147 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 3148 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 3149 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 3150 PetscBLASInt bbs,bcn,bbm,bcm; 3151 PetscScalar *z = NULL; 3152 PetscScalar *c,*b; 3153 const MatScalar *v; 3154 const PetscInt *idx,*ii,*ridx=NULL; 3155 PetscScalar _DZero=0.0,_DOne=1.0; 3156 PetscBool usecprow=a->compressedrow.use; 3157 3158 PetscFunctionBegin; 3159 if (!cm || !cn) PetscFunctionReturn(0); 3160 PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 3161 PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 3162 PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 3163 b = bd->v; 3164 if (a->nonzerorowcnt != A->rmap->n) { 3165 PetscCall(MatZeroEntries(C)); 3166 } 3167 PetscCall(MatDenseGetArray(C,&c)); 3168 switch (bs) { 3169 case 1: 3170 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3171 break; 3172 case 2: 3173 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3174 break; 3175 case 3: 3176 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3177 break; 3178 case 4: 3179 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3180 break; 3181 case 5: 3182 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3183 break; 3184 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3185 PetscCall(PetscBLASIntCast(bs,&bbs)); 3186 PetscCall(PetscBLASIntCast(cn,&bcn)); 3187 PetscCall(PetscBLASIntCast(bm,&bbm)); 3188 PetscCall(PetscBLASIntCast(cm,&bcm)); 3189 idx = a->j; 3190 v = a->a; 3191 if (usecprow) { 3192 mbs = a->compressedrow.nrows; 3193 ii = a->compressedrow.i; 3194 ridx = a->compressedrow.rindex; 3195 } else { 3196 mbs = a->mbs; 3197 ii = a->i; 3198 z = c; 3199 } 3200 for (i=0; i<mbs; i++) { 3201 n = ii[1] - ii[0]; ii++; 3202 if (usecprow) z = c + bs*ridx[i]; 3203 if (n) { 3204 PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 3205 v += bs2; 3206 } 3207 for (j=1; j<n; j++) { 3208 PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 3209 v += bs2; 3210 } 3211 if (!usecprow) z += bs; 3212 } 3213 } 3214 PetscCall(MatDenseRestoreArray(C,&c)); 3215 PetscCall(PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn)); 3216 PetscFunctionReturn(0); 3217 } 3218