1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 14 const PetscInt *idx; 15 PetscInt start, end, *ai, *aj, bs, *nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 25 26 PetscCall(PetscBTCreate(m, &table)); 27 PetscCall(PetscMalloc1(m + 1, &nidx)); 28 PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 29 30 for (i = 0; i < is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 PetscCall(PetscBTMemzero(m, table)); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 PetscCall(ISGetIndices(is[i], &idx)); 37 PetscCall(ISGetLocalSize(is[i], &n)); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j = 0; j < n; ++j) { 41 ival = idx[j] / bs; /* convert the indices into block indices */ 42 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 43 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44 } 45 PetscCall(ISRestoreIndices(is[i], &idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j = 0; j < ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row + 1]; 55 for (l = start; l < end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j = 0; j < isz; j++) { 63 for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 64 } 65 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 66 } 67 PetscCall(PetscBTDestroy(&table)); 68 PetscCall(PetscFree(nidx)); 69 PetscCall(PetscFree(nidx2)); 70 PetscFunctionReturn(0); 71 } 72 73 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 74 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 75 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 76 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 77 const PetscInt *irow, *icol; 78 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 79 PetscInt *aj = a->j, *ai = a->i; 80 MatScalar *mat_a; 81 Mat C; 82 PetscBool flag; 83 84 PetscFunctionBegin; 85 PetscCall(ISGetIndices(isrow, &irow)); 86 PetscCall(ISGetIndices(iscol, &icol)); 87 PetscCall(ISGetLocalSize(isrow, &nrows)); 88 PetscCall(ISGetLocalSize(iscol, &ncols)); 89 90 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 91 ssmap = smap; 92 PetscCall(PetscMalloc1(1 + nrows, &lens)); 93 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 94 /* determine lens of each row */ 95 for (i = 0; i < nrows; i++) { 96 kstart = ai[irow[i]]; 97 kend = kstart + a->ilen[irow[i]]; 98 lens[i] = 0; 99 for (k = kstart; k < kend; k++) { 100 if (ssmap[aj[k]]) lens[i]++; 101 } 102 } 103 /* Create and fill new matrix */ 104 if (scall == MAT_REUSE_MATRIX) { 105 c = (Mat_SeqBAIJ *)((*B)->data); 106 107 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 108 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 109 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 110 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 111 C = *B; 112 } else { 113 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 114 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 115 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 116 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 117 } 118 c = (Mat_SeqBAIJ *)(C->data); 119 for (i = 0; i < nrows; i++) { 120 row = irow[i]; 121 kstart = ai[row]; 122 kend = kstart + a->ilen[row]; 123 mat_i = c->i[i]; 124 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 125 mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 126 mat_ilen = c->ilen + i; 127 for (k = kstart; k < kend; k++) { 128 if ((tcol = ssmap[a->j[k]])) { 129 *mat_j++ = tcol - 1; 130 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 131 mat_a += bs2; 132 (*mat_ilen)++; 133 } 134 } 135 } 136 /* sort */ 137 if (c->j && c->a) { 138 MatScalar *work; 139 PetscCall(PetscMalloc1(bs2, &work)); 140 for (i = 0; i < nrows; i++) { 141 PetscInt ilen; 142 mat_i = c->i[i]; 143 mat_j = c->j + mat_i; 144 mat_a = c->a + mat_i * bs2; 145 ilen = c->ilen[i]; 146 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 147 } 148 PetscCall(PetscFree(work)); 149 } 150 151 /* Free work space */ 152 PetscCall(ISRestoreIndices(iscol, &icol)); 153 PetscCall(PetscFree(smap)); 154 PetscCall(PetscFree(lens)); 155 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 156 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 157 158 PetscCall(ISRestoreIndices(isrow, &irow)); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 164 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 165 IS is1, is2; 166 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 167 const PetscInt *irow, *icol; 168 169 PetscFunctionBegin; 170 PetscCall(ISGetIndices(isrow, &irow)); 171 PetscCall(ISGetIndices(iscol, &icol)); 172 PetscCall(ISGetLocalSize(isrow, &nrows)); 173 PetscCall(ISGetLocalSize(iscol, &ncols)); 174 175 /* Verify if the indices corespond to each element in a block 176 and form the IS with compressed IS */ 177 maxmnbs = PetscMax(a->mbs, a->nbs); 178 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 179 PetscCall(PetscArrayzero(vary, a->mbs)); 180 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 181 for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); } 182 count = 0; 183 for (i = 0; i < nrows; i++) { 184 j = irow[i] / bs; 185 if ((vary[j]--) == bs) iary[count++] = j; 186 } 187 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 188 189 PetscCall(PetscArrayzero(vary, a->nbs)); 190 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 191 for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); } 192 count = 0; 193 for (i = 0; i < ncols; i++) { 194 j = icol[i] / bs; 195 if ((vary[j]--) == bs) iary[count++] = j; 196 } 197 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 198 PetscCall(ISRestoreIndices(isrow, &irow)); 199 PetscCall(ISRestoreIndices(iscol, &icol)); 200 PetscCall(PetscFree2(vary, iary)); 201 202 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 203 PetscCall(ISDestroy(&is1)); 204 PetscCall(ISDestroy(&is2)); 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) { 209 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 210 Mat_SubSppt *submatj = c->submatis1; 211 212 PetscFunctionBegin; 213 PetscCall((*submatj->destroy)(C)); 214 PetscCall(MatDestroySubMatrix_Private(submatj)); 215 PetscFunctionReturn(0); 216 } 217 218 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 219 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) { 220 PetscInt i; 221 Mat C; 222 Mat_SeqBAIJ *c; 223 Mat_SubSppt *submatj; 224 225 PetscFunctionBegin; 226 for (i = 0; i < n; i++) { 227 C = (*mat)[i]; 228 c = (Mat_SeqBAIJ *)C->data; 229 submatj = c->submatis1; 230 if (submatj) { 231 if (--((PetscObject)C)->refct <= 0) { 232 PetscCall(PetscFree(C->factorprefix)); 233 PetscCall((*submatj->destroy)(C)); 234 PetscCall(MatDestroySubMatrix_Private(submatj)); 235 PetscCall(PetscFree(C->defaultvectype)); 236 PetscCall(PetscLayoutDestroy(&C->rmap)); 237 PetscCall(PetscLayoutDestroy(&C->cmap)); 238 PetscCall(PetscHeaderDestroy(&C)); 239 } 240 } else { 241 PetscCall(MatDestroy(&C)); 242 } 243 } 244 245 /* Destroy Dummy submatrices created for reuse */ 246 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 247 248 PetscCall(PetscFree(*mat)); 249 PetscFunctionReturn(0); 250 } 251 252 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { 253 PetscInt i; 254 255 PetscFunctionBegin; 256 if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(n + 1, B)); } 257 258 for (i = 0; i < n; i++) { PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); } 259 PetscFunctionReturn(0); 260 } 261 262 /* -------------------------------------------------------*/ 263 /* Should check that shapes of vectors and matrices match */ 264 /* -------------------------------------------------------*/ 265 266 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) { 267 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 268 PetscScalar *z, sum; 269 const PetscScalar *x; 270 const MatScalar *v; 271 PetscInt mbs, i, n; 272 const PetscInt *idx, *ii, *ridx = NULL; 273 PetscBool usecprow = a->compressedrow.use; 274 275 PetscFunctionBegin; 276 PetscCall(VecGetArrayRead(xx, &x)); 277 PetscCall(VecGetArrayWrite(zz, &z)); 278 279 if (usecprow) { 280 mbs = a->compressedrow.nrows; 281 ii = a->compressedrow.i; 282 ridx = a->compressedrow.rindex; 283 PetscCall(PetscArrayzero(z, a->mbs)); 284 } else { 285 mbs = a->mbs; 286 ii = a->i; 287 } 288 289 for (i = 0; i < mbs; i++) { 290 n = ii[1] - ii[0]; 291 v = a->a + ii[0]; 292 idx = a->j + ii[0]; 293 ii++; 294 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 295 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 296 sum = 0.0; 297 PetscSparseDensePlusDot(sum, x, v, idx, n); 298 if (usecprow) { 299 z[ridx[i]] = sum; 300 } else { 301 z[i] = sum; 302 } 303 } 304 PetscCall(VecRestoreArrayRead(xx, &x)); 305 PetscCall(VecRestoreArrayWrite(zz, &z)); 306 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 307 PetscFunctionReturn(0); 308 } 309 310 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) { 311 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 312 PetscScalar *z = NULL, sum1, sum2, *zarray; 313 const PetscScalar *x, *xb; 314 PetscScalar x1, x2; 315 const MatScalar *v; 316 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 317 PetscBool usecprow = a->compressedrow.use; 318 319 PetscFunctionBegin; 320 PetscCall(VecGetArrayRead(xx, &x)); 321 PetscCall(VecGetArrayWrite(zz, &zarray)); 322 323 idx = a->j; 324 v = a->a; 325 if (usecprow) { 326 mbs = a->compressedrow.nrows; 327 ii = a->compressedrow.i; 328 ridx = a->compressedrow.rindex; 329 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 330 } else { 331 mbs = a->mbs; 332 ii = a->i; 333 z = zarray; 334 } 335 336 for (i = 0; i < mbs; i++) { 337 n = ii[1] - ii[0]; 338 ii++; 339 sum1 = 0.0; 340 sum2 = 0.0; 341 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 342 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 343 for (j = 0; j < n; j++) { 344 xb = x + 2 * (*idx++); 345 x1 = xb[0]; 346 x2 = xb[1]; 347 sum1 += v[0] * x1 + v[2] * x2; 348 sum2 += v[1] * x1 + v[3] * x2; 349 v += 4; 350 } 351 if (usecprow) z = zarray + 2 * ridx[i]; 352 z[0] = sum1; 353 z[1] = sum2; 354 if (!usecprow) z += 2; 355 } 356 PetscCall(VecRestoreArrayRead(xx, &x)); 357 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 358 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 359 PetscFunctionReturn(0); 360 } 361 362 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) { 363 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 364 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 365 const PetscScalar *x, *xb; 366 const MatScalar *v; 367 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 368 PetscBool usecprow = a->compressedrow.use; 369 370 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 371 #pragma disjoint(*v, *z, *xb) 372 #endif 373 374 PetscFunctionBegin; 375 PetscCall(VecGetArrayRead(xx, &x)); 376 PetscCall(VecGetArrayWrite(zz, &zarray)); 377 378 idx = a->j; 379 v = a->a; 380 if (usecprow) { 381 mbs = a->compressedrow.nrows; 382 ii = a->compressedrow.i; 383 ridx = a->compressedrow.rindex; 384 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 385 } else { 386 mbs = a->mbs; 387 ii = a->i; 388 z = zarray; 389 } 390 391 for (i = 0; i < mbs; i++) { 392 n = ii[1] - ii[0]; 393 ii++; 394 sum1 = 0.0; 395 sum2 = 0.0; 396 sum3 = 0.0; 397 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 398 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 399 for (j = 0; j < n; j++) { 400 xb = x + 3 * (*idx++); 401 x1 = xb[0]; 402 x2 = xb[1]; 403 x3 = xb[2]; 404 405 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 406 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 407 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 408 v += 9; 409 } 410 if (usecprow) z = zarray + 3 * ridx[i]; 411 z[0] = sum1; 412 z[1] = sum2; 413 z[2] = sum3; 414 if (!usecprow) z += 3; 415 } 416 PetscCall(VecRestoreArrayRead(xx, &x)); 417 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 418 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 419 PetscFunctionReturn(0); 420 } 421 422 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) { 423 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 424 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 425 const PetscScalar *x, *xb; 426 const MatScalar *v; 427 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 428 PetscBool usecprow = a->compressedrow.use; 429 430 PetscFunctionBegin; 431 PetscCall(VecGetArrayRead(xx, &x)); 432 PetscCall(VecGetArrayWrite(zz, &zarray)); 433 434 idx = a->j; 435 v = a->a; 436 if (usecprow) { 437 mbs = a->compressedrow.nrows; 438 ii = a->compressedrow.i; 439 ridx = a->compressedrow.rindex; 440 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 441 } else { 442 mbs = a->mbs; 443 ii = a->i; 444 z = zarray; 445 } 446 447 for (i = 0; i < mbs; i++) { 448 n = ii[1] - ii[0]; 449 ii++; 450 sum1 = 0.0; 451 sum2 = 0.0; 452 sum3 = 0.0; 453 sum4 = 0.0; 454 455 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 456 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 457 for (j = 0; j < n; j++) { 458 xb = x + 4 * (*idx++); 459 x1 = xb[0]; 460 x2 = xb[1]; 461 x3 = xb[2]; 462 x4 = xb[3]; 463 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 464 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 465 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 466 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 467 v += 16; 468 } 469 if (usecprow) z = zarray + 4 * ridx[i]; 470 z[0] = sum1; 471 z[1] = sum2; 472 z[2] = sum3; 473 z[3] = sum4; 474 if (!usecprow) z += 4; 475 } 476 PetscCall(VecRestoreArrayRead(xx, &x)); 477 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 478 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 479 PetscFunctionReturn(0); 480 } 481 482 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) { 483 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 484 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 485 const PetscScalar *xb, *x; 486 const MatScalar *v; 487 const PetscInt *idx, *ii, *ridx = NULL; 488 PetscInt mbs, i, j, n; 489 PetscBool usecprow = a->compressedrow.use; 490 491 PetscFunctionBegin; 492 PetscCall(VecGetArrayRead(xx, &x)); 493 PetscCall(VecGetArrayWrite(zz, &zarray)); 494 495 idx = a->j; 496 v = a->a; 497 if (usecprow) { 498 mbs = a->compressedrow.nrows; 499 ii = a->compressedrow.i; 500 ridx = a->compressedrow.rindex; 501 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 502 } else { 503 mbs = a->mbs; 504 ii = a->i; 505 z = zarray; 506 } 507 508 for (i = 0; i < mbs; i++) { 509 n = ii[1] - ii[0]; 510 ii++; 511 sum1 = 0.0; 512 sum2 = 0.0; 513 sum3 = 0.0; 514 sum4 = 0.0; 515 sum5 = 0.0; 516 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 517 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 518 for (j = 0; j < n; j++) { 519 xb = x + 5 * (*idx++); 520 x1 = xb[0]; 521 x2 = xb[1]; 522 x3 = xb[2]; 523 x4 = xb[3]; 524 x5 = xb[4]; 525 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 526 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 527 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 528 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 529 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 530 v += 25; 531 } 532 if (usecprow) z = zarray + 5 * ridx[i]; 533 z[0] = sum1; 534 z[1] = sum2; 535 z[2] = sum3; 536 z[3] = sum4; 537 z[4] = sum5; 538 if (!usecprow) z += 5; 539 } 540 PetscCall(VecRestoreArrayRead(xx, &x)); 541 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 542 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 543 PetscFunctionReturn(0); 544 } 545 546 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) { 547 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 548 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 549 const PetscScalar *x, *xb; 550 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 551 const MatScalar *v; 552 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 553 PetscBool usecprow = a->compressedrow.use; 554 555 PetscFunctionBegin; 556 PetscCall(VecGetArrayRead(xx, &x)); 557 PetscCall(VecGetArrayWrite(zz, &zarray)); 558 559 idx = a->j; 560 v = a->a; 561 if (usecprow) { 562 mbs = a->compressedrow.nrows; 563 ii = a->compressedrow.i; 564 ridx = a->compressedrow.rindex; 565 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 566 } else { 567 mbs = a->mbs; 568 ii = a->i; 569 z = zarray; 570 } 571 572 for (i = 0; i < mbs; i++) { 573 n = ii[1] - ii[0]; 574 ii++; 575 sum1 = 0.0; 576 sum2 = 0.0; 577 sum3 = 0.0; 578 sum4 = 0.0; 579 sum5 = 0.0; 580 sum6 = 0.0; 581 582 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 583 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 584 for (j = 0; j < n; j++) { 585 xb = x + 6 * (*idx++); 586 x1 = xb[0]; 587 x2 = xb[1]; 588 x3 = xb[2]; 589 x4 = xb[3]; 590 x5 = xb[4]; 591 x6 = xb[5]; 592 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 593 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 594 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 595 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 596 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 597 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 598 v += 36; 599 } 600 if (usecprow) z = zarray + 6 * ridx[i]; 601 z[0] = sum1; 602 z[1] = sum2; 603 z[2] = sum3; 604 z[3] = sum4; 605 z[4] = sum5; 606 z[5] = sum6; 607 if (!usecprow) z += 6; 608 } 609 610 PetscCall(VecRestoreArrayRead(xx, &x)); 611 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 612 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 613 PetscFunctionReturn(0); 614 } 615 616 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) { 617 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 618 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 619 const PetscScalar *x, *xb; 620 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 621 const MatScalar *v; 622 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 623 PetscBool usecprow = a->compressedrow.use; 624 625 PetscFunctionBegin; 626 PetscCall(VecGetArrayRead(xx, &x)); 627 PetscCall(VecGetArrayWrite(zz, &zarray)); 628 629 idx = a->j; 630 v = a->a; 631 if (usecprow) { 632 mbs = a->compressedrow.nrows; 633 ii = a->compressedrow.i; 634 ridx = a->compressedrow.rindex; 635 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 636 } else { 637 mbs = a->mbs; 638 ii = a->i; 639 z = zarray; 640 } 641 642 for (i = 0; i < mbs; i++) { 643 n = ii[1] - ii[0]; 644 ii++; 645 sum1 = 0.0; 646 sum2 = 0.0; 647 sum3 = 0.0; 648 sum4 = 0.0; 649 sum5 = 0.0; 650 sum6 = 0.0; 651 sum7 = 0.0; 652 653 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 654 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 655 for (j = 0; j < n; j++) { 656 xb = x + 7 * (*idx++); 657 x1 = xb[0]; 658 x2 = xb[1]; 659 x3 = xb[2]; 660 x4 = xb[3]; 661 x5 = xb[4]; 662 x6 = xb[5]; 663 x7 = xb[6]; 664 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 665 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 666 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 667 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 668 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 669 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 670 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 671 v += 49; 672 } 673 if (usecprow) z = zarray + 7 * ridx[i]; 674 z[0] = sum1; 675 z[1] = sum2; 676 z[2] = sum3; 677 z[3] = sum4; 678 z[4] = sum5; 679 z[5] = sum6; 680 z[6] = sum7; 681 if (!usecprow) z += 7; 682 } 683 684 PetscCall(VecRestoreArrayRead(xx, &x)); 685 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 686 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 687 PetscFunctionReturn(0); 688 } 689 690 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 691 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) { 692 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 693 PetscScalar *z = NULL, *work, *workt, *zarray; 694 const PetscScalar *x, *xb; 695 const MatScalar *v; 696 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 697 const PetscInt *idx, *ii, *ridx = NULL; 698 PetscInt k; 699 PetscBool usecprow = a->compressedrow.use; 700 701 __m256d a0, a1, a2, a3, a4, a5; 702 __m256d w0, w1, w2, w3; 703 __m256d z0, z1, z2; 704 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 705 706 PetscFunctionBegin; 707 PetscCall(VecGetArrayRead(xx, &x)); 708 PetscCall(VecGetArrayWrite(zz, &zarray)); 709 710 idx = a->j; 711 v = a->a; 712 if (usecprow) { 713 mbs = a->compressedrow.nrows; 714 ii = a->compressedrow.i; 715 ridx = a->compressedrow.rindex; 716 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 717 } else { 718 mbs = a->mbs; 719 ii = a->i; 720 z = zarray; 721 } 722 723 if (!a->mult_work) { 724 k = PetscMax(A->rmap->n, A->cmap->n); 725 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 726 } 727 728 work = a->mult_work; 729 for (i = 0; i < mbs; i++) { 730 n = ii[1] - ii[0]; 731 ii++; 732 workt = work; 733 for (j = 0; j < n; j++) { 734 xb = x + bs * (*idx++); 735 for (k = 0; k < bs; k++) workt[k] = xb[k]; 736 workt += bs; 737 } 738 if (usecprow) z = zarray + bs * ridx[i]; 739 740 z0 = _mm256_setzero_pd(); 741 z1 = _mm256_setzero_pd(); 742 z2 = _mm256_setzero_pd(); 743 744 for (j = 0; j < n; j++) { 745 /* first column of a */ 746 w0 = _mm256_set1_pd(work[j * 9]); 747 a0 = _mm256_loadu_pd(&v[j * 81]); 748 z0 = _mm256_fmadd_pd(a0, w0, z0); 749 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 750 z1 = _mm256_fmadd_pd(a1, w0, z1); 751 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 752 z2 = _mm256_fmadd_pd(a2, w0, z2); 753 754 /* second column of a */ 755 w1 = _mm256_set1_pd(work[j * 9 + 1]); 756 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 757 z0 = _mm256_fmadd_pd(a0, w1, z0); 758 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 759 z1 = _mm256_fmadd_pd(a1, w1, z1); 760 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 761 z2 = _mm256_fmadd_pd(a2, w1, z2); 762 763 /* third column of a */ 764 w2 = _mm256_set1_pd(work[j * 9 + 2]); 765 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 766 z0 = _mm256_fmadd_pd(a3, w2, z0); 767 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 768 z1 = _mm256_fmadd_pd(a4, w2, z1); 769 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 770 z2 = _mm256_fmadd_pd(a5, w2, z2); 771 772 /* fourth column of a */ 773 w3 = _mm256_set1_pd(work[j * 9 + 3]); 774 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 775 z0 = _mm256_fmadd_pd(a0, w3, z0); 776 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 777 z1 = _mm256_fmadd_pd(a1, w3, z1); 778 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 779 z2 = _mm256_fmadd_pd(a2, w3, z2); 780 781 /* fifth column of a */ 782 w0 = _mm256_set1_pd(work[j * 9 + 4]); 783 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 784 z0 = _mm256_fmadd_pd(a3, w0, z0); 785 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 786 z1 = _mm256_fmadd_pd(a4, w0, z1); 787 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 788 z2 = _mm256_fmadd_pd(a5, w0, z2); 789 790 /* sixth column of a */ 791 w1 = _mm256_set1_pd(work[j * 9 + 5]); 792 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 793 z0 = _mm256_fmadd_pd(a0, w1, z0); 794 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 795 z1 = _mm256_fmadd_pd(a1, w1, z1); 796 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 797 z2 = _mm256_fmadd_pd(a2, w1, z2); 798 799 /* seventh column of a */ 800 w2 = _mm256_set1_pd(work[j * 9 + 6]); 801 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 802 z0 = _mm256_fmadd_pd(a0, w2, z0); 803 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 804 z1 = _mm256_fmadd_pd(a1, w2, z1); 805 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 806 z2 = _mm256_fmadd_pd(a2, w2, z2); 807 808 /* eighth column of a */ 809 w3 = _mm256_set1_pd(work[j * 9 + 7]); 810 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 811 z0 = _mm256_fmadd_pd(a3, w3, z0); 812 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 813 z1 = _mm256_fmadd_pd(a4, w3, z1); 814 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 815 z2 = _mm256_fmadd_pd(a5, w3, z2); 816 817 /* ninth column of a */ 818 w0 = _mm256_set1_pd(work[j * 9 + 8]); 819 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 820 z0 = _mm256_fmadd_pd(a0, w0, z0); 821 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 822 z1 = _mm256_fmadd_pd(a1, w0, z1); 823 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 824 z2 = _mm256_fmadd_pd(a2, w0, z2); 825 } 826 827 _mm256_storeu_pd(&z[0], z0); 828 _mm256_storeu_pd(&z[4], z1); 829 _mm256_maskstore_pd(&z[8], mask1, z2); 830 831 v += n * bs2; 832 if (!usecprow) z += bs; 833 } 834 PetscCall(VecRestoreArrayRead(xx, &x)); 835 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 836 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 837 PetscFunctionReturn(0); 838 } 839 #endif 840 841 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) { 842 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 843 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 844 const PetscScalar *x, *xb; 845 PetscScalar *zarray, xv; 846 const MatScalar *v; 847 const PetscInt *ii, *ij = a->j, *idx; 848 PetscInt mbs, i, j, k, n, *ridx = NULL; 849 PetscBool usecprow = a->compressedrow.use; 850 851 PetscFunctionBegin; 852 PetscCall(VecGetArrayRead(xx, &x)); 853 PetscCall(VecGetArrayWrite(zz, &zarray)); 854 855 v = a->a; 856 if (usecprow) { 857 mbs = a->compressedrow.nrows; 858 ii = a->compressedrow.i; 859 ridx = a->compressedrow.rindex; 860 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 861 } else { 862 mbs = a->mbs; 863 ii = a->i; 864 z = zarray; 865 } 866 867 for (i = 0; i < mbs; i++) { 868 n = ii[i + 1] - ii[i]; 869 idx = ij + ii[i]; 870 sum1 = 0.0; 871 sum2 = 0.0; 872 sum3 = 0.0; 873 sum4 = 0.0; 874 sum5 = 0.0; 875 sum6 = 0.0; 876 sum7 = 0.0; 877 sum8 = 0.0; 878 sum9 = 0.0; 879 sum10 = 0.0; 880 sum11 = 0.0; 881 882 for (j = 0; j < n; j++) { 883 xb = x + 11 * (idx[j]); 884 885 for (k = 0; k < 11; k++) { 886 xv = xb[k]; 887 sum1 += v[0] * xv; 888 sum2 += v[1] * xv; 889 sum3 += v[2] * xv; 890 sum4 += v[3] * xv; 891 sum5 += v[4] * xv; 892 sum6 += v[5] * xv; 893 sum7 += v[6] * xv; 894 sum8 += v[7] * xv; 895 sum9 += v[8] * xv; 896 sum10 += v[9] * xv; 897 sum11 += v[10] * xv; 898 v += 11; 899 } 900 } 901 if (usecprow) z = zarray + 11 * ridx[i]; 902 z[0] = sum1; 903 z[1] = sum2; 904 z[2] = sum3; 905 z[3] = sum4; 906 z[4] = sum5; 907 z[5] = sum6; 908 z[6] = sum7; 909 z[7] = sum8; 910 z[8] = sum9; 911 z[9] = sum10; 912 z[10] = sum11; 913 914 if (!usecprow) z += 11; 915 } 916 917 PetscCall(VecRestoreArrayRead(xx, &x)); 918 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 919 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 920 PetscFunctionReturn(0); 921 } 922 923 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 924 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) { 925 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 926 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 927 const PetscScalar *x, *xb; 928 PetscScalar *zarray, xv; 929 const MatScalar *v; 930 const PetscInt *ii, *ij = a->j, *idx; 931 PetscInt mbs, i, j, k, n, *ridx = NULL; 932 PetscBool usecprow = a->compressedrow.use; 933 934 PetscFunctionBegin; 935 PetscCall(VecGetArrayRead(xx, &x)); 936 PetscCall(VecGetArrayWrite(zz, &zarray)); 937 938 v = a->a; 939 if (usecprow) { 940 mbs = a->compressedrow.nrows; 941 ii = a->compressedrow.i; 942 ridx = a->compressedrow.rindex; 943 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 944 } else { 945 mbs = a->mbs; 946 ii = a->i; 947 z = zarray; 948 } 949 950 for (i = 0; i < mbs; i++) { 951 n = ii[i + 1] - ii[i]; 952 idx = ij + ii[i]; 953 sum1 = 0.0; 954 sum2 = 0.0; 955 sum3 = 0.0; 956 sum4 = 0.0; 957 sum5 = 0.0; 958 sum6 = 0.0; 959 sum7 = 0.0; 960 sum8 = 0.0; 961 sum9 = 0.0; 962 sum10 = 0.0; 963 sum11 = 0.0; 964 sum12 = 0.0; 965 966 for (j = 0; j < n; j++) { 967 xb = x + 12 * (idx[j]); 968 969 for (k = 0; k < 12; k++) { 970 xv = xb[k]; 971 sum1 += v[0] * xv; 972 sum2 += v[1] * xv; 973 sum3 += v[2] * xv; 974 sum4 += v[3] * xv; 975 sum5 += v[4] * xv; 976 sum6 += v[5] * xv; 977 sum7 += v[6] * xv; 978 sum8 += v[7] * xv; 979 sum9 += v[8] * xv; 980 sum10 += v[9] * xv; 981 sum11 += v[10] * xv; 982 sum12 += v[11] * xv; 983 v += 12; 984 } 985 } 986 if (usecprow) z = zarray + 12 * ridx[i]; 987 z[0] = sum1; 988 z[1] = sum2; 989 z[2] = sum3; 990 z[3] = sum4; 991 z[4] = sum5; 992 z[5] = sum6; 993 z[6] = sum7; 994 z[7] = sum8; 995 z[8] = sum9; 996 z[9] = sum10; 997 z[10] = sum11; 998 z[11] = sum12; 999 if (!usecprow) z += 12; 1000 } 1001 PetscCall(VecRestoreArrayRead(xx, &x)); 1002 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1003 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1004 PetscFunctionReturn(0); 1005 } 1006 1007 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) { 1008 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1009 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1010 const PetscScalar *x, *xb; 1011 PetscScalar *zarray, *yarray, xv; 1012 const MatScalar *v; 1013 const PetscInt *ii, *ij = a->j, *idx; 1014 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 1015 PetscBool usecprow = a->compressedrow.use; 1016 1017 PetscFunctionBegin; 1018 PetscCall(VecGetArrayRead(xx, &x)); 1019 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1020 1021 v = a->a; 1022 if (usecprow) { 1023 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); } 1024 mbs = a->compressedrow.nrows; 1025 ii = a->compressedrow.i; 1026 ridx = a->compressedrow.rindex; 1027 } else { 1028 ii = a->i; 1029 y = yarray; 1030 z = zarray; 1031 } 1032 1033 for (i = 0; i < mbs; i++) { 1034 n = ii[i + 1] - ii[i]; 1035 idx = ij + ii[i]; 1036 1037 if (usecprow) { 1038 y = yarray + 12 * ridx[i]; 1039 z = zarray + 12 * ridx[i]; 1040 } 1041 sum1 = y[0]; 1042 sum2 = y[1]; 1043 sum3 = y[2]; 1044 sum4 = y[3]; 1045 sum5 = y[4]; 1046 sum6 = y[5]; 1047 sum7 = y[6]; 1048 sum8 = y[7]; 1049 sum9 = y[8]; 1050 sum10 = y[9]; 1051 sum11 = y[10]; 1052 sum12 = y[11]; 1053 1054 for (j = 0; j < n; j++) { 1055 xb = x + 12 * (idx[j]); 1056 1057 for (k = 0; k < 12; k++) { 1058 xv = xb[k]; 1059 sum1 += v[0] * xv; 1060 sum2 += v[1] * xv; 1061 sum3 += v[2] * xv; 1062 sum4 += v[3] * xv; 1063 sum5 += v[4] * xv; 1064 sum6 += v[5] * xv; 1065 sum7 += v[6] * xv; 1066 sum8 += v[7] * xv; 1067 sum9 += v[8] * xv; 1068 sum10 += v[9] * xv; 1069 sum11 += v[10] * xv; 1070 sum12 += v[11] * xv; 1071 v += 12; 1072 } 1073 } 1074 1075 z[0] = sum1; 1076 z[1] = sum2; 1077 z[2] = sum3; 1078 z[3] = sum4; 1079 z[4] = sum5; 1080 z[5] = sum6; 1081 z[6] = sum7; 1082 z[7] = sum8; 1083 z[8] = sum9; 1084 z[9] = sum10; 1085 z[10] = sum11; 1086 z[11] = sum12; 1087 if (!usecprow) { 1088 y += 12; 1089 z += 12; 1090 } 1091 } 1092 PetscCall(VecRestoreArrayRead(xx, &x)); 1093 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1094 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1095 PetscFunctionReturn(0); 1096 } 1097 1098 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1099 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) { 1100 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1101 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1102 const PetscScalar *x, *xb; 1103 PetscScalar x1, x2, x3, x4, *zarray; 1104 const MatScalar *v; 1105 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1106 PetscInt mbs, i, j, n; 1107 PetscBool usecprow = a->compressedrow.use; 1108 1109 PetscFunctionBegin; 1110 PetscCall(VecGetArrayRead(xx, &x)); 1111 PetscCall(VecGetArrayWrite(zz, &zarray)); 1112 1113 v = a->a; 1114 if (usecprow) { 1115 mbs = a->compressedrow.nrows; 1116 ii = a->compressedrow.i; 1117 ridx = a->compressedrow.rindex; 1118 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1119 } else { 1120 mbs = a->mbs; 1121 ii = a->i; 1122 z = zarray; 1123 } 1124 1125 for (i = 0; i < mbs; i++) { 1126 n = ii[i + 1] - ii[i]; 1127 idx = ij + ii[i]; 1128 1129 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1130 for (j = 0; j < n; j++) { 1131 xb = x + 12 * (idx[j]); 1132 x1 = xb[0]; 1133 x2 = xb[1]; 1134 x3 = xb[2]; 1135 x4 = xb[3]; 1136 1137 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1138 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1139 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1140 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1141 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1142 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1143 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1144 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1145 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1146 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1147 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1148 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1149 v += 48; 1150 1151 x1 = xb[4]; 1152 x2 = xb[5]; 1153 x3 = xb[6]; 1154 x4 = xb[7]; 1155 1156 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1157 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1158 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1159 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1160 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1161 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1162 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1163 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1164 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1165 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1166 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1167 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1168 v += 48; 1169 1170 x1 = xb[8]; 1171 x2 = xb[9]; 1172 x3 = xb[10]; 1173 x4 = xb[11]; 1174 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1175 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1176 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1177 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1178 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1179 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1180 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1181 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1182 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1183 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1184 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1185 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1186 v += 48; 1187 } 1188 if (usecprow) z = zarray + 12 * ridx[i]; 1189 z[0] = sum1; 1190 z[1] = sum2; 1191 z[2] = sum3; 1192 z[3] = sum4; 1193 z[4] = sum5; 1194 z[5] = sum6; 1195 z[6] = sum7; 1196 z[7] = sum8; 1197 z[8] = sum9; 1198 z[9] = sum10; 1199 z[10] = sum11; 1200 z[11] = sum12; 1201 if (!usecprow) z += 12; 1202 } 1203 PetscCall(VecRestoreArrayRead(xx, &x)); 1204 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1205 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1206 PetscFunctionReturn(0); 1207 } 1208 1209 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1210 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) { 1211 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1212 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1213 const PetscScalar *x, *xb; 1214 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1215 const MatScalar *v; 1216 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1217 PetscInt mbs = a->mbs, i, j, n; 1218 PetscBool usecprow = a->compressedrow.use; 1219 1220 PetscFunctionBegin; 1221 PetscCall(VecGetArrayRead(xx, &x)); 1222 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1223 1224 v = a->a; 1225 if (usecprow) { 1226 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); } 1227 mbs = a->compressedrow.nrows; 1228 ii = a->compressedrow.i; 1229 ridx = a->compressedrow.rindex; 1230 } else { 1231 ii = a->i; 1232 y = yarray; 1233 z = zarray; 1234 } 1235 1236 for (i = 0; i < mbs; i++) { 1237 n = ii[i + 1] - ii[i]; 1238 idx = ij + ii[i]; 1239 1240 if (usecprow) { 1241 y = yarray + 12 * ridx[i]; 1242 z = zarray + 12 * ridx[i]; 1243 } 1244 sum1 = y[0]; 1245 sum2 = y[1]; 1246 sum3 = y[2]; 1247 sum4 = y[3]; 1248 sum5 = y[4]; 1249 sum6 = y[5]; 1250 sum7 = y[6]; 1251 sum8 = y[7]; 1252 sum9 = y[8]; 1253 sum10 = y[9]; 1254 sum11 = y[10]; 1255 sum12 = y[11]; 1256 1257 for (j = 0; j < n; j++) { 1258 xb = x + 12 * (idx[j]); 1259 x1 = xb[0]; 1260 x2 = xb[1]; 1261 x3 = xb[2]; 1262 x4 = xb[3]; 1263 1264 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1265 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1266 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1267 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1268 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1269 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1270 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1271 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1272 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1273 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1274 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1275 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1276 v += 48; 1277 1278 x1 = xb[4]; 1279 x2 = xb[5]; 1280 x3 = xb[6]; 1281 x4 = xb[7]; 1282 1283 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1284 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1285 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1286 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1287 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1288 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1289 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1290 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1291 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1292 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1293 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1294 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1295 v += 48; 1296 1297 x1 = xb[8]; 1298 x2 = xb[9]; 1299 x3 = xb[10]; 1300 x4 = xb[11]; 1301 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1302 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1303 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1304 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1305 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1306 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1307 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1308 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1309 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1310 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1311 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1312 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1313 v += 48; 1314 } 1315 z[0] = sum1; 1316 z[1] = sum2; 1317 z[2] = sum3; 1318 z[3] = sum4; 1319 z[4] = sum5; 1320 z[5] = sum6; 1321 z[6] = sum7; 1322 z[7] = sum8; 1323 z[8] = sum9; 1324 z[9] = sum10; 1325 z[10] = sum11; 1326 z[11] = sum12; 1327 if (!usecprow) { 1328 y += 12; 1329 z += 12; 1330 } 1331 } 1332 PetscCall(VecRestoreArrayRead(xx, &x)); 1333 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1334 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1335 PetscFunctionReturn(0); 1336 } 1337 1338 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1339 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) { 1340 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1341 PetscScalar *z = NULL, *zarray; 1342 const PetscScalar *x, *work; 1343 const MatScalar *v = a->a; 1344 PetscInt mbs, i, j, n; 1345 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1346 PetscBool usecprow = a->compressedrow.use; 1347 const PetscInt bs = 12, bs2 = 144; 1348 1349 __m256d a0, a1, a2, a3, a4, a5; 1350 __m256d w0, w1, w2, w3; 1351 __m256d z0, z1, z2; 1352 1353 PetscFunctionBegin; 1354 PetscCall(VecGetArrayRead(xx, &x)); 1355 PetscCall(VecGetArrayWrite(zz, &zarray)); 1356 1357 if (usecprow) { 1358 mbs = a->compressedrow.nrows; 1359 ii = a->compressedrow.i; 1360 ridx = a->compressedrow.rindex; 1361 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1362 } else { 1363 mbs = a->mbs; 1364 ii = a->i; 1365 z = zarray; 1366 } 1367 1368 for (i = 0; i < mbs; i++) { 1369 z0 = _mm256_setzero_pd(); 1370 z1 = _mm256_setzero_pd(); 1371 z2 = _mm256_setzero_pd(); 1372 1373 n = ii[1] - ii[0]; 1374 ii++; 1375 for (j = 0; j < n; j++) { 1376 work = x + bs * (*idx++); 1377 1378 /* first column of a */ 1379 w0 = _mm256_set1_pd(work[0]); 1380 a0 = _mm256_loadu_pd(v + 0); 1381 z0 = _mm256_fmadd_pd(a0, w0, z0); 1382 a1 = _mm256_loadu_pd(v + 4); 1383 z1 = _mm256_fmadd_pd(a1, w0, z1); 1384 a2 = _mm256_loadu_pd(v + 8); 1385 z2 = _mm256_fmadd_pd(a2, w0, z2); 1386 1387 /* second column of a */ 1388 w1 = _mm256_set1_pd(work[1]); 1389 a3 = _mm256_loadu_pd(v + 12); 1390 z0 = _mm256_fmadd_pd(a3, w1, z0); 1391 a4 = _mm256_loadu_pd(v + 16); 1392 z1 = _mm256_fmadd_pd(a4, w1, z1); 1393 a5 = _mm256_loadu_pd(v + 20); 1394 z2 = _mm256_fmadd_pd(a5, w1, z2); 1395 1396 /* third column of a */ 1397 w2 = _mm256_set1_pd(work[2]); 1398 a0 = _mm256_loadu_pd(v + 24); 1399 z0 = _mm256_fmadd_pd(a0, w2, z0); 1400 a1 = _mm256_loadu_pd(v + 28); 1401 z1 = _mm256_fmadd_pd(a1, w2, z1); 1402 a2 = _mm256_loadu_pd(v + 32); 1403 z2 = _mm256_fmadd_pd(a2, w2, z2); 1404 1405 /* fourth column of a */ 1406 w3 = _mm256_set1_pd(work[3]); 1407 a3 = _mm256_loadu_pd(v + 36); 1408 z0 = _mm256_fmadd_pd(a3, w3, z0); 1409 a4 = _mm256_loadu_pd(v + 40); 1410 z1 = _mm256_fmadd_pd(a4, w3, z1); 1411 a5 = _mm256_loadu_pd(v + 44); 1412 z2 = _mm256_fmadd_pd(a5, w3, z2); 1413 1414 /* fifth column of a */ 1415 w0 = _mm256_set1_pd(work[4]); 1416 a0 = _mm256_loadu_pd(v + 48); 1417 z0 = _mm256_fmadd_pd(a0, w0, z0); 1418 a1 = _mm256_loadu_pd(v + 52); 1419 z1 = _mm256_fmadd_pd(a1, w0, z1); 1420 a2 = _mm256_loadu_pd(v + 56); 1421 z2 = _mm256_fmadd_pd(a2, w0, z2); 1422 1423 /* sixth column of a */ 1424 w1 = _mm256_set1_pd(work[5]); 1425 a3 = _mm256_loadu_pd(v + 60); 1426 z0 = _mm256_fmadd_pd(a3, w1, z0); 1427 a4 = _mm256_loadu_pd(v + 64); 1428 z1 = _mm256_fmadd_pd(a4, w1, z1); 1429 a5 = _mm256_loadu_pd(v + 68); 1430 z2 = _mm256_fmadd_pd(a5, w1, z2); 1431 1432 /* seventh column of a */ 1433 w2 = _mm256_set1_pd(work[6]); 1434 a0 = _mm256_loadu_pd(v + 72); 1435 z0 = _mm256_fmadd_pd(a0, w2, z0); 1436 a1 = _mm256_loadu_pd(v + 76); 1437 z1 = _mm256_fmadd_pd(a1, w2, z1); 1438 a2 = _mm256_loadu_pd(v + 80); 1439 z2 = _mm256_fmadd_pd(a2, w2, z2); 1440 1441 /* eighth column of a */ 1442 w3 = _mm256_set1_pd(work[7]); 1443 a3 = _mm256_loadu_pd(v + 84); 1444 z0 = _mm256_fmadd_pd(a3, w3, z0); 1445 a4 = _mm256_loadu_pd(v + 88); 1446 z1 = _mm256_fmadd_pd(a4, w3, z1); 1447 a5 = _mm256_loadu_pd(v + 92); 1448 z2 = _mm256_fmadd_pd(a5, w3, z2); 1449 1450 /* ninth column of a */ 1451 w0 = _mm256_set1_pd(work[8]); 1452 a0 = _mm256_loadu_pd(v + 96); 1453 z0 = _mm256_fmadd_pd(a0, w0, z0); 1454 a1 = _mm256_loadu_pd(v + 100); 1455 z1 = _mm256_fmadd_pd(a1, w0, z1); 1456 a2 = _mm256_loadu_pd(v + 104); 1457 z2 = _mm256_fmadd_pd(a2, w0, z2); 1458 1459 /* tenth column of a */ 1460 w1 = _mm256_set1_pd(work[9]); 1461 a3 = _mm256_loadu_pd(v + 108); 1462 z0 = _mm256_fmadd_pd(a3, w1, z0); 1463 a4 = _mm256_loadu_pd(v + 112); 1464 z1 = _mm256_fmadd_pd(a4, w1, z1); 1465 a5 = _mm256_loadu_pd(v + 116); 1466 z2 = _mm256_fmadd_pd(a5, w1, z2); 1467 1468 /* eleventh column of a */ 1469 w2 = _mm256_set1_pd(work[10]); 1470 a0 = _mm256_loadu_pd(v + 120); 1471 z0 = _mm256_fmadd_pd(a0, w2, z0); 1472 a1 = _mm256_loadu_pd(v + 124); 1473 z1 = _mm256_fmadd_pd(a1, w2, z1); 1474 a2 = _mm256_loadu_pd(v + 128); 1475 z2 = _mm256_fmadd_pd(a2, w2, z2); 1476 1477 /* twelveth column of a */ 1478 w3 = _mm256_set1_pd(work[11]); 1479 a3 = _mm256_loadu_pd(v + 132); 1480 z0 = _mm256_fmadd_pd(a3, w3, z0); 1481 a4 = _mm256_loadu_pd(v + 136); 1482 z1 = _mm256_fmadd_pd(a4, w3, z1); 1483 a5 = _mm256_loadu_pd(v + 140); 1484 z2 = _mm256_fmadd_pd(a5, w3, z2); 1485 1486 v += bs2; 1487 } 1488 if (usecprow) z = zarray + bs * ridx[i]; 1489 _mm256_storeu_pd(&z[0], z0); 1490 _mm256_storeu_pd(&z[4], z1); 1491 _mm256_storeu_pd(&z[8], z2); 1492 if (!usecprow) z += bs; 1493 } 1494 PetscCall(VecRestoreArrayRead(xx, &x)); 1495 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1496 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1497 PetscFunctionReturn(0); 1498 } 1499 #endif 1500 1501 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1502 /* Default MatMult for block size 15 */ 1503 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) { 1504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1505 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1506 const PetscScalar *x, *xb; 1507 PetscScalar *zarray, xv; 1508 const MatScalar *v; 1509 const PetscInt *ii, *ij = a->j, *idx; 1510 PetscInt mbs, i, j, k, n, *ridx = NULL; 1511 PetscBool usecprow = a->compressedrow.use; 1512 1513 PetscFunctionBegin; 1514 PetscCall(VecGetArrayRead(xx, &x)); 1515 PetscCall(VecGetArrayWrite(zz, &zarray)); 1516 1517 v = a->a; 1518 if (usecprow) { 1519 mbs = a->compressedrow.nrows; 1520 ii = a->compressedrow.i; 1521 ridx = a->compressedrow.rindex; 1522 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1523 } else { 1524 mbs = a->mbs; 1525 ii = a->i; 1526 z = zarray; 1527 } 1528 1529 for (i = 0; i < mbs; i++) { 1530 n = ii[i + 1] - ii[i]; 1531 idx = ij + ii[i]; 1532 sum1 = 0.0; 1533 sum2 = 0.0; 1534 sum3 = 0.0; 1535 sum4 = 0.0; 1536 sum5 = 0.0; 1537 sum6 = 0.0; 1538 sum7 = 0.0; 1539 sum8 = 0.0; 1540 sum9 = 0.0; 1541 sum10 = 0.0; 1542 sum11 = 0.0; 1543 sum12 = 0.0; 1544 sum13 = 0.0; 1545 sum14 = 0.0; 1546 sum15 = 0.0; 1547 1548 for (j = 0; j < n; j++) { 1549 xb = x + 15 * (idx[j]); 1550 1551 for (k = 0; k < 15; k++) { 1552 xv = xb[k]; 1553 sum1 += v[0] * xv; 1554 sum2 += v[1] * xv; 1555 sum3 += v[2] * xv; 1556 sum4 += v[3] * xv; 1557 sum5 += v[4] * xv; 1558 sum6 += v[5] * xv; 1559 sum7 += v[6] * xv; 1560 sum8 += v[7] * xv; 1561 sum9 += v[8] * xv; 1562 sum10 += v[9] * xv; 1563 sum11 += v[10] * xv; 1564 sum12 += v[11] * xv; 1565 sum13 += v[12] * xv; 1566 sum14 += v[13] * xv; 1567 sum15 += v[14] * xv; 1568 v += 15; 1569 } 1570 } 1571 if (usecprow) z = zarray + 15 * ridx[i]; 1572 z[0] = sum1; 1573 z[1] = sum2; 1574 z[2] = sum3; 1575 z[3] = sum4; 1576 z[4] = sum5; 1577 z[5] = sum6; 1578 z[6] = sum7; 1579 z[7] = sum8; 1580 z[8] = sum9; 1581 z[9] = sum10; 1582 z[10] = sum11; 1583 z[11] = sum12; 1584 z[12] = sum13; 1585 z[13] = sum14; 1586 z[14] = sum15; 1587 1588 if (!usecprow) z += 15; 1589 } 1590 1591 PetscCall(VecRestoreArrayRead(xx, &x)); 1592 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1593 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1594 PetscFunctionReturn(0); 1595 } 1596 1597 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1598 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) { 1599 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1600 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1601 const PetscScalar *x, *xb; 1602 PetscScalar x1, x2, x3, x4, *zarray; 1603 const MatScalar *v; 1604 const PetscInt *ii, *ij = a->j, *idx; 1605 PetscInt mbs, i, j, n, *ridx = NULL; 1606 PetscBool usecprow = a->compressedrow.use; 1607 1608 PetscFunctionBegin; 1609 PetscCall(VecGetArrayRead(xx, &x)); 1610 PetscCall(VecGetArrayWrite(zz, &zarray)); 1611 1612 v = a->a; 1613 if (usecprow) { 1614 mbs = a->compressedrow.nrows; 1615 ii = a->compressedrow.i; 1616 ridx = a->compressedrow.rindex; 1617 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1618 } else { 1619 mbs = a->mbs; 1620 ii = a->i; 1621 z = zarray; 1622 } 1623 1624 for (i = 0; i < mbs; i++) { 1625 n = ii[i + 1] - ii[i]; 1626 idx = ij + ii[i]; 1627 sum1 = 0.0; 1628 sum2 = 0.0; 1629 sum3 = 0.0; 1630 sum4 = 0.0; 1631 sum5 = 0.0; 1632 sum6 = 0.0; 1633 sum7 = 0.0; 1634 sum8 = 0.0; 1635 sum9 = 0.0; 1636 sum10 = 0.0; 1637 sum11 = 0.0; 1638 sum12 = 0.0; 1639 sum13 = 0.0; 1640 sum14 = 0.0; 1641 sum15 = 0.0; 1642 1643 for (j = 0; j < n; j++) { 1644 xb = x + 15 * (idx[j]); 1645 x1 = xb[0]; 1646 x2 = xb[1]; 1647 x3 = xb[2]; 1648 x4 = xb[3]; 1649 1650 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1651 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1652 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1653 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1654 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1655 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1656 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1657 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1658 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1659 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1660 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1661 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1662 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1663 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1664 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1665 1666 v += 60; 1667 1668 x1 = xb[4]; 1669 x2 = xb[5]; 1670 x3 = xb[6]; 1671 x4 = xb[7]; 1672 1673 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1674 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1675 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1676 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1677 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1678 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1679 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1680 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1681 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1682 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1683 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1684 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1685 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1686 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1687 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1688 v += 60; 1689 1690 x1 = xb[8]; 1691 x2 = xb[9]; 1692 x3 = xb[10]; 1693 x4 = xb[11]; 1694 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1695 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1696 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1697 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1698 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1699 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1700 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1701 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1702 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1703 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1704 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1705 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1706 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1707 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1708 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1709 v += 60; 1710 1711 x1 = xb[12]; 1712 x2 = xb[13]; 1713 x3 = xb[14]; 1714 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1715 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1716 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1717 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1718 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1719 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1720 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1721 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1722 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1723 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1724 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1725 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1726 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1727 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1728 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1729 v += 45; 1730 } 1731 if (usecprow) z = zarray + 15 * ridx[i]; 1732 z[0] = sum1; 1733 z[1] = sum2; 1734 z[2] = sum3; 1735 z[3] = sum4; 1736 z[4] = sum5; 1737 z[5] = sum6; 1738 z[6] = sum7; 1739 z[7] = sum8; 1740 z[8] = sum9; 1741 z[9] = sum10; 1742 z[10] = sum11; 1743 z[11] = sum12; 1744 z[12] = sum13; 1745 z[13] = sum14; 1746 z[14] = sum15; 1747 1748 if (!usecprow) z += 15; 1749 } 1750 1751 PetscCall(VecRestoreArrayRead(xx, &x)); 1752 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1753 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1754 PetscFunctionReturn(0); 1755 } 1756 1757 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1758 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) { 1759 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1760 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1761 const PetscScalar *x, *xb; 1762 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1763 const MatScalar *v; 1764 const PetscInt *ii, *ij = a->j, *idx; 1765 PetscInt mbs, i, j, n, *ridx = NULL; 1766 PetscBool usecprow = a->compressedrow.use; 1767 1768 PetscFunctionBegin; 1769 PetscCall(VecGetArrayRead(xx, &x)); 1770 PetscCall(VecGetArrayWrite(zz, &zarray)); 1771 1772 v = a->a; 1773 if (usecprow) { 1774 mbs = a->compressedrow.nrows; 1775 ii = a->compressedrow.i; 1776 ridx = a->compressedrow.rindex; 1777 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1778 } else { 1779 mbs = a->mbs; 1780 ii = a->i; 1781 z = zarray; 1782 } 1783 1784 for (i = 0; i < mbs; i++) { 1785 n = ii[i + 1] - ii[i]; 1786 idx = ij + ii[i]; 1787 sum1 = 0.0; 1788 sum2 = 0.0; 1789 sum3 = 0.0; 1790 sum4 = 0.0; 1791 sum5 = 0.0; 1792 sum6 = 0.0; 1793 sum7 = 0.0; 1794 sum8 = 0.0; 1795 sum9 = 0.0; 1796 sum10 = 0.0; 1797 sum11 = 0.0; 1798 sum12 = 0.0; 1799 sum13 = 0.0; 1800 sum14 = 0.0; 1801 sum15 = 0.0; 1802 1803 for (j = 0; j < n; j++) { 1804 xb = x + 15 * (idx[j]); 1805 x1 = xb[0]; 1806 x2 = xb[1]; 1807 x3 = xb[2]; 1808 x4 = xb[3]; 1809 x5 = xb[4]; 1810 x6 = xb[5]; 1811 x7 = xb[6]; 1812 x8 = xb[7]; 1813 1814 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1815 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1816 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1817 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1818 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1819 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1820 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1821 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1822 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1823 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1824 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1825 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1826 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1827 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1828 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1829 v += 120; 1830 1831 x1 = xb[8]; 1832 x2 = xb[9]; 1833 x3 = xb[10]; 1834 x4 = xb[11]; 1835 x5 = xb[12]; 1836 x6 = xb[13]; 1837 x7 = xb[14]; 1838 1839 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1840 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1841 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1842 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1843 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1844 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1845 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1846 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1847 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1848 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1849 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1850 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1851 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1852 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1853 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1854 v += 105; 1855 } 1856 if (usecprow) z = zarray + 15 * ridx[i]; 1857 z[0] = sum1; 1858 z[1] = sum2; 1859 z[2] = sum3; 1860 z[3] = sum4; 1861 z[4] = sum5; 1862 z[5] = sum6; 1863 z[6] = sum7; 1864 z[7] = sum8; 1865 z[8] = sum9; 1866 z[9] = sum10; 1867 z[10] = sum11; 1868 z[11] = sum12; 1869 z[12] = sum13; 1870 z[13] = sum14; 1871 z[14] = sum15; 1872 1873 if (!usecprow) z += 15; 1874 } 1875 1876 PetscCall(VecRestoreArrayRead(xx, &x)); 1877 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1878 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1879 PetscFunctionReturn(0); 1880 } 1881 1882 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1883 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) { 1884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1885 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1886 const PetscScalar *x, *xb; 1887 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1888 const MatScalar *v; 1889 const PetscInt *ii, *ij = a->j, *idx; 1890 PetscInt mbs, i, j, n, *ridx = NULL; 1891 PetscBool usecprow = a->compressedrow.use; 1892 1893 PetscFunctionBegin; 1894 PetscCall(VecGetArrayRead(xx, &x)); 1895 PetscCall(VecGetArrayWrite(zz, &zarray)); 1896 1897 v = a->a; 1898 if (usecprow) { 1899 mbs = a->compressedrow.nrows; 1900 ii = a->compressedrow.i; 1901 ridx = a->compressedrow.rindex; 1902 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1903 } else { 1904 mbs = a->mbs; 1905 ii = a->i; 1906 z = zarray; 1907 } 1908 1909 for (i = 0; i < mbs; i++) { 1910 n = ii[i + 1] - ii[i]; 1911 idx = ij + ii[i]; 1912 sum1 = 0.0; 1913 sum2 = 0.0; 1914 sum3 = 0.0; 1915 sum4 = 0.0; 1916 sum5 = 0.0; 1917 sum6 = 0.0; 1918 sum7 = 0.0; 1919 sum8 = 0.0; 1920 sum9 = 0.0; 1921 sum10 = 0.0; 1922 sum11 = 0.0; 1923 sum12 = 0.0; 1924 sum13 = 0.0; 1925 sum14 = 0.0; 1926 sum15 = 0.0; 1927 1928 for (j = 0; j < n; j++) { 1929 xb = x + 15 * (idx[j]); 1930 x1 = xb[0]; 1931 x2 = xb[1]; 1932 x3 = xb[2]; 1933 x4 = xb[3]; 1934 x5 = xb[4]; 1935 x6 = xb[5]; 1936 x7 = xb[6]; 1937 x8 = xb[7]; 1938 x9 = xb[8]; 1939 x10 = xb[9]; 1940 x11 = xb[10]; 1941 x12 = xb[11]; 1942 x13 = xb[12]; 1943 x14 = xb[13]; 1944 x15 = xb[14]; 1945 1946 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1947 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1948 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1949 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1950 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1951 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1952 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1953 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1954 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1955 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1956 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1957 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1958 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1959 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1960 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1961 v += 225; 1962 } 1963 if (usecprow) z = zarray + 15 * ridx[i]; 1964 z[0] = sum1; 1965 z[1] = sum2; 1966 z[2] = sum3; 1967 z[3] = sum4; 1968 z[4] = sum5; 1969 z[5] = sum6; 1970 z[6] = sum7; 1971 z[7] = sum8; 1972 z[8] = sum9; 1973 z[9] = sum10; 1974 z[10] = sum11; 1975 z[11] = sum12; 1976 z[12] = sum13; 1977 z[13] = sum14; 1978 z[14] = sum15; 1979 1980 if (!usecprow) z += 15; 1981 } 1982 1983 PetscCall(VecRestoreArrayRead(xx, &x)); 1984 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1985 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1986 PetscFunctionReturn(0); 1987 } 1988 1989 /* 1990 This will not work with MatScalar == float because it calls the BLAS 1991 */ 1992 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) { 1993 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1994 PetscScalar *z = NULL, *work, *workt, *zarray; 1995 const PetscScalar *x, *xb; 1996 const MatScalar *v; 1997 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1998 const PetscInt *idx, *ii, *ridx = NULL; 1999 PetscInt ncols, k; 2000 PetscBool usecprow = a->compressedrow.use; 2001 2002 PetscFunctionBegin; 2003 PetscCall(VecGetArrayRead(xx, &x)); 2004 PetscCall(VecGetArrayWrite(zz, &zarray)); 2005 2006 idx = a->j; 2007 v = a->a; 2008 if (usecprow) { 2009 mbs = a->compressedrow.nrows; 2010 ii = a->compressedrow.i; 2011 ridx = a->compressedrow.rindex; 2012 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2013 } else { 2014 mbs = a->mbs; 2015 ii = a->i; 2016 z = zarray; 2017 } 2018 2019 if (!a->mult_work) { 2020 k = PetscMax(A->rmap->n, A->cmap->n); 2021 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2022 } 2023 work = a->mult_work; 2024 for (i = 0; i < mbs; i++) { 2025 n = ii[1] - ii[0]; 2026 ii++; 2027 ncols = n * bs; 2028 workt = work; 2029 for (j = 0; j < n; j++) { 2030 xb = x + bs * (*idx++); 2031 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2032 workt += bs; 2033 } 2034 if (usecprow) z = zarray + bs * ridx[i]; 2035 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2036 v += n * bs2; 2037 if (!usecprow) z += bs; 2038 } 2039 PetscCall(VecRestoreArrayRead(xx, &x)); 2040 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2041 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2042 PetscFunctionReturn(0); 2043 } 2044 2045 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) { 2046 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2047 const PetscScalar *x; 2048 PetscScalar *y, *z, sum; 2049 const MatScalar *v; 2050 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2051 const PetscInt *idx, *ii; 2052 PetscBool usecprow = a->compressedrow.use; 2053 2054 PetscFunctionBegin; 2055 PetscCall(VecGetArrayRead(xx, &x)); 2056 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2057 2058 idx = a->j; 2059 v = a->a; 2060 if (usecprow) { 2061 if (zz != yy) { PetscCall(PetscArraycpy(z, y, mbs)); } 2062 mbs = a->compressedrow.nrows; 2063 ii = a->compressedrow.i; 2064 ridx = a->compressedrow.rindex; 2065 } else { 2066 ii = a->i; 2067 } 2068 2069 for (i = 0; i < mbs; i++) { 2070 n = ii[1] - ii[0]; 2071 ii++; 2072 if (!usecprow) { 2073 sum = y[i]; 2074 } else { 2075 sum = y[ridx[i]]; 2076 } 2077 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2078 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2079 PetscSparseDensePlusDot(sum, x, v, idx, n); 2080 v += n; 2081 idx += n; 2082 if (usecprow) { 2083 z[ridx[i]] = sum; 2084 } else { 2085 z[i] = sum; 2086 } 2087 } 2088 PetscCall(VecRestoreArrayRead(xx, &x)); 2089 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2090 PetscCall(PetscLogFlops(2.0 * a->nz)); 2091 PetscFunctionReturn(0); 2092 } 2093 2094 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) { 2095 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2096 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2097 const PetscScalar *x, *xb; 2098 PetscScalar x1, x2, *yarray, *zarray; 2099 const MatScalar *v; 2100 PetscInt mbs = a->mbs, i, n, j; 2101 const PetscInt *idx, *ii, *ridx = NULL; 2102 PetscBool usecprow = a->compressedrow.use; 2103 2104 PetscFunctionBegin; 2105 PetscCall(VecGetArrayRead(xx, &x)); 2106 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2107 2108 idx = a->j; 2109 v = a->a; 2110 if (usecprow) { 2111 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); } 2112 mbs = a->compressedrow.nrows; 2113 ii = a->compressedrow.i; 2114 ridx = a->compressedrow.rindex; 2115 } else { 2116 ii = a->i; 2117 y = yarray; 2118 z = zarray; 2119 } 2120 2121 for (i = 0; i < mbs; i++) { 2122 n = ii[1] - ii[0]; 2123 ii++; 2124 if (usecprow) { 2125 z = zarray + 2 * ridx[i]; 2126 y = yarray + 2 * ridx[i]; 2127 } 2128 sum1 = y[0]; 2129 sum2 = y[1]; 2130 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2131 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2132 for (j = 0; j < n; j++) { 2133 xb = x + 2 * (*idx++); 2134 x1 = xb[0]; 2135 x2 = xb[1]; 2136 2137 sum1 += v[0] * x1 + v[2] * x2; 2138 sum2 += v[1] * x1 + v[3] * x2; 2139 v += 4; 2140 } 2141 z[0] = sum1; 2142 z[1] = sum2; 2143 if (!usecprow) { 2144 z += 2; 2145 y += 2; 2146 } 2147 } 2148 PetscCall(VecRestoreArrayRead(xx, &x)); 2149 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2150 PetscCall(PetscLogFlops(4.0 * a->nz)); 2151 PetscFunctionReturn(0); 2152 } 2153 2154 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) { 2155 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2156 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2157 const PetscScalar *x, *xb; 2158 const MatScalar *v; 2159 PetscInt mbs = a->mbs, i, j, n; 2160 const PetscInt *idx, *ii, *ridx = NULL; 2161 PetscBool usecprow = a->compressedrow.use; 2162 2163 PetscFunctionBegin; 2164 PetscCall(VecGetArrayRead(xx, &x)); 2165 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2166 2167 idx = a->j; 2168 v = a->a; 2169 if (usecprow) { 2170 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); } 2171 mbs = a->compressedrow.nrows; 2172 ii = a->compressedrow.i; 2173 ridx = a->compressedrow.rindex; 2174 } else { 2175 ii = a->i; 2176 y = yarray; 2177 z = zarray; 2178 } 2179 2180 for (i = 0; i < mbs; i++) { 2181 n = ii[1] - ii[0]; 2182 ii++; 2183 if (usecprow) { 2184 z = zarray + 3 * ridx[i]; 2185 y = yarray + 3 * ridx[i]; 2186 } 2187 sum1 = y[0]; 2188 sum2 = y[1]; 2189 sum3 = y[2]; 2190 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2191 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2192 for (j = 0; j < n; j++) { 2193 xb = x + 3 * (*idx++); 2194 x1 = xb[0]; 2195 x2 = xb[1]; 2196 x3 = xb[2]; 2197 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2198 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2199 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2200 v += 9; 2201 } 2202 z[0] = sum1; 2203 z[1] = sum2; 2204 z[2] = sum3; 2205 if (!usecprow) { 2206 z += 3; 2207 y += 3; 2208 } 2209 } 2210 PetscCall(VecRestoreArrayRead(xx, &x)); 2211 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2212 PetscCall(PetscLogFlops(18.0 * a->nz)); 2213 PetscFunctionReturn(0); 2214 } 2215 2216 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) { 2217 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2218 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2219 const PetscScalar *x, *xb; 2220 const MatScalar *v; 2221 PetscInt mbs = a->mbs, i, j, n; 2222 const PetscInt *idx, *ii, *ridx = NULL; 2223 PetscBool usecprow = a->compressedrow.use; 2224 2225 PetscFunctionBegin; 2226 PetscCall(VecGetArrayRead(xx, &x)); 2227 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2228 2229 idx = a->j; 2230 v = a->a; 2231 if (usecprow) { 2232 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); } 2233 mbs = a->compressedrow.nrows; 2234 ii = a->compressedrow.i; 2235 ridx = a->compressedrow.rindex; 2236 } else { 2237 ii = a->i; 2238 y = yarray; 2239 z = zarray; 2240 } 2241 2242 for (i = 0; i < mbs; i++) { 2243 n = ii[1] - ii[0]; 2244 ii++; 2245 if (usecprow) { 2246 z = zarray + 4 * ridx[i]; 2247 y = yarray + 4 * ridx[i]; 2248 } 2249 sum1 = y[0]; 2250 sum2 = y[1]; 2251 sum3 = y[2]; 2252 sum4 = y[3]; 2253 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2254 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2255 for (j = 0; j < n; j++) { 2256 xb = x + 4 * (*idx++); 2257 x1 = xb[0]; 2258 x2 = xb[1]; 2259 x3 = xb[2]; 2260 x4 = xb[3]; 2261 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2262 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2263 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2264 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2265 v += 16; 2266 } 2267 z[0] = sum1; 2268 z[1] = sum2; 2269 z[2] = sum3; 2270 z[3] = sum4; 2271 if (!usecprow) { 2272 z += 4; 2273 y += 4; 2274 } 2275 } 2276 PetscCall(VecRestoreArrayRead(xx, &x)); 2277 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2278 PetscCall(PetscLogFlops(32.0 * a->nz)); 2279 PetscFunctionReturn(0); 2280 } 2281 2282 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) { 2283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2284 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2285 const PetscScalar *x, *xb; 2286 PetscScalar *yarray, *zarray; 2287 const MatScalar *v; 2288 PetscInt mbs = a->mbs, i, j, n; 2289 const PetscInt *idx, *ii, *ridx = NULL; 2290 PetscBool usecprow = a->compressedrow.use; 2291 2292 PetscFunctionBegin; 2293 PetscCall(VecGetArrayRead(xx, &x)); 2294 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2295 2296 idx = a->j; 2297 v = a->a; 2298 if (usecprow) { 2299 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); } 2300 mbs = a->compressedrow.nrows; 2301 ii = a->compressedrow.i; 2302 ridx = a->compressedrow.rindex; 2303 } else { 2304 ii = a->i; 2305 y = yarray; 2306 z = zarray; 2307 } 2308 2309 for (i = 0; i < mbs; i++) { 2310 n = ii[1] - ii[0]; 2311 ii++; 2312 if (usecprow) { 2313 z = zarray + 5 * ridx[i]; 2314 y = yarray + 5 * ridx[i]; 2315 } 2316 sum1 = y[0]; 2317 sum2 = y[1]; 2318 sum3 = y[2]; 2319 sum4 = y[3]; 2320 sum5 = y[4]; 2321 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2322 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2323 for (j = 0; j < n; j++) { 2324 xb = x + 5 * (*idx++); 2325 x1 = xb[0]; 2326 x2 = xb[1]; 2327 x3 = xb[2]; 2328 x4 = xb[3]; 2329 x5 = xb[4]; 2330 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2331 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2332 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2333 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2334 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2335 v += 25; 2336 } 2337 z[0] = sum1; 2338 z[1] = sum2; 2339 z[2] = sum3; 2340 z[3] = sum4; 2341 z[4] = sum5; 2342 if (!usecprow) { 2343 z += 5; 2344 y += 5; 2345 } 2346 } 2347 PetscCall(VecRestoreArrayRead(xx, &x)); 2348 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2349 PetscCall(PetscLogFlops(50.0 * a->nz)); 2350 PetscFunctionReturn(0); 2351 } 2352 2353 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) { 2354 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2355 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2356 const PetscScalar *x, *xb; 2357 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2358 const MatScalar *v; 2359 PetscInt mbs = a->mbs, i, j, n; 2360 const PetscInt *idx, *ii, *ridx = NULL; 2361 PetscBool usecprow = a->compressedrow.use; 2362 2363 PetscFunctionBegin; 2364 PetscCall(VecGetArrayRead(xx, &x)); 2365 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2366 2367 idx = a->j; 2368 v = a->a; 2369 if (usecprow) { 2370 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); } 2371 mbs = a->compressedrow.nrows; 2372 ii = a->compressedrow.i; 2373 ridx = a->compressedrow.rindex; 2374 } else { 2375 ii = a->i; 2376 y = yarray; 2377 z = zarray; 2378 } 2379 2380 for (i = 0; i < mbs; i++) { 2381 n = ii[1] - ii[0]; 2382 ii++; 2383 if (usecprow) { 2384 z = zarray + 6 * ridx[i]; 2385 y = yarray + 6 * ridx[i]; 2386 } 2387 sum1 = y[0]; 2388 sum2 = y[1]; 2389 sum3 = y[2]; 2390 sum4 = y[3]; 2391 sum5 = y[4]; 2392 sum6 = y[5]; 2393 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2394 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2395 for (j = 0; j < n; j++) { 2396 xb = x + 6 * (*idx++); 2397 x1 = xb[0]; 2398 x2 = xb[1]; 2399 x3 = xb[2]; 2400 x4 = xb[3]; 2401 x5 = xb[4]; 2402 x6 = xb[5]; 2403 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2404 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2405 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2406 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2407 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2408 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2409 v += 36; 2410 } 2411 z[0] = sum1; 2412 z[1] = sum2; 2413 z[2] = sum3; 2414 z[3] = sum4; 2415 z[4] = sum5; 2416 z[5] = sum6; 2417 if (!usecprow) { 2418 z += 6; 2419 y += 6; 2420 } 2421 } 2422 PetscCall(VecRestoreArrayRead(xx, &x)); 2423 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2424 PetscCall(PetscLogFlops(72.0 * a->nz)); 2425 PetscFunctionReturn(0); 2426 } 2427 2428 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { 2429 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2430 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2431 const PetscScalar *x, *xb; 2432 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2433 const MatScalar *v; 2434 PetscInt mbs = a->mbs, i, j, n; 2435 const PetscInt *idx, *ii, *ridx = NULL; 2436 PetscBool usecprow = a->compressedrow.use; 2437 2438 PetscFunctionBegin; 2439 PetscCall(VecGetArrayRead(xx, &x)); 2440 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2441 2442 idx = a->j; 2443 v = a->a; 2444 if (usecprow) { 2445 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); } 2446 mbs = a->compressedrow.nrows; 2447 ii = a->compressedrow.i; 2448 ridx = a->compressedrow.rindex; 2449 } else { 2450 ii = a->i; 2451 y = yarray; 2452 z = zarray; 2453 } 2454 2455 for (i = 0; i < mbs; i++) { 2456 n = ii[1] - ii[0]; 2457 ii++; 2458 if (usecprow) { 2459 z = zarray + 7 * ridx[i]; 2460 y = yarray + 7 * ridx[i]; 2461 } 2462 sum1 = y[0]; 2463 sum2 = y[1]; 2464 sum3 = y[2]; 2465 sum4 = y[3]; 2466 sum5 = y[4]; 2467 sum6 = y[5]; 2468 sum7 = y[6]; 2469 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2470 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2471 for (j = 0; j < n; j++) { 2472 xb = x + 7 * (*idx++); 2473 x1 = xb[0]; 2474 x2 = xb[1]; 2475 x3 = xb[2]; 2476 x4 = xb[3]; 2477 x5 = xb[4]; 2478 x6 = xb[5]; 2479 x7 = xb[6]; 2480 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2481 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2482 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2483 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2484 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2485 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2486 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2487 v += 49; 2488 } 2489 z[0] = sum1; 2490 z[1] = sum2; 2491 z[2] = sum3; 2492 z[3] = sum4; 2493 z[4] = sum5; 2494 z[5] = sum6; 2495 z[6] = sum7; 2496 if (!usecprow) { 2497 z += 7; 2498 y += 7; 2499 } 2500 } 2501 PetscCall(VecRestoreArrayRead(xx, &x)); 2502 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2503 PetscCall(PetscLogFlops(98.0 * a->nz)); 2504 PetscFunctionReturn(0); 2505 } 2506 2507 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2508 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) { 2509 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2510 PetscScalar *z = NULL, *work, *workt, *zarray; 2511 const PetscScalar *x, *xb; 2512 const MatScalar *v; 2513 PetscInt mbs, i, j, n; 2514 PetscInt k; 2515 PetscBool usecprow = a->compressedrow.use; 2516 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2517 2518 __m256d a0, a1, a2, a3, a4, a5; 2519 __m256d w0, w1, w2, w3; 2520 __m256d z0, z1, z2; 2521 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2522 2523 PetscFunctionBegin; 2524 PetscCall(VecCopy(yy, zz)); 2525 PetscCall(VecGetArrayRead(xx, &x)); 2526 PetscCall(VecGetArray(zz, &zarray)); 2527 2528 idx = a->j; 2529 v = a->a; 2530 if (usecprow) { 2531 mbs = a->compressedrow.nrows; 2532 ii = a->compressedrow.i; 2533 ridx = a->compressedrow.rindex; 2534 } else { 2535 mbs = a->mbs; 2536 ii = a->i; 2537 z = zarray; 2538 } 2539 2540 if (!a->mult_work) { 2541 k = PetscMax(A->rmap->n, A->cmap->n); 2542 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2543 } 2544 2545 work = a->mult_work; 2546 for (i = 0; i < mbs; i++) { 2547 n = ii[1] - ii[0]; 2548 ii++; 2549 workt = work; 2550 for (j = 0; j < n; j++) { 2551 xb = x + bs * (*idx++); 2552 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2553 workt += bs; 2554 } 2555 if (usecprow) z = zarray + bs * ridx[i]; 2556 2557 z0 = _mm256_loadu_pd(&z[0]); 2558 z1 = _mm256_loadu_pd(&z[4]); 2559 z2 = _mm256_set1_pd(z[8]); 2560 2561 for (j = 0; j < n; j++) { 2562 /* first column of a */ 2563 w0 = _mm256_set1_pd(work[j * 9]); 2564 a0 = _mm256_loadu_pd(&v[j * 81]); 2565 z0 = _mm256_fmadd_pd(a0, w0, z0); 2566 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2567 z1 = _mm256_fmadd_pd(a1, w0, z1); 2568 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2569 z2 = _mm256_fmadd_pd(a2, w0, z2); 2570 2571 /* second column of a */ 2572 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2573 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2574 z0 = _mm256_fmadd_pd(a0, w1, z0); 2575 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2576 z1 = _mm256_fmadd_pd(a1, w1, z1); 2577 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2578 z2 = _mm256_fmadd_pd(a2, w1, z2); 2579 2580 /* third column of a */ 2581 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2582 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2583 z0 = _mm256_fmadd_pd(a3, w2, z0); 2584 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2585 z1 = _mm256_fmadd_pd(a4, w2, z1); 2586 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2587 z2 = _mm256_fmadd_pd(a5, w2, z2); 2588 2589 /* fourth column of a */ 2590 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2591 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2592 z0 = _mm256_fmadd_pd(a0, w3, z0); 2593 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2594 z1 = _mm256_fmadd_pd(a1, w3, z1); 2595 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2596 z2 = _mm256_fmadd_pd(a2, w3, z2); 2597 2598 /* fifth column of a */ 2599 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2600 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2601 z0 = _mm256_fmadd_pd(a3, w0, z0); 2602 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2603 z1 = _mm256_fmadd_pd(a4, w0, z1); 2604 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2605 z2 = _mm256_fmadd_pd(a5, w0, z2); 2606 2607 /* sixth column of a */ 2608 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2609 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2610 z0 = _mm256_fmadd_pd(a0, w1, z0); 2611 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2612 z1 = _mm256_fmadd_pd(a1, w1, z1); 2613 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2614 z2 = _mm256_fmadd_pd(a2, w1, z2); 2615 2616 /* seventh column of a */ 2617 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2618 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2619 z0 = _mm256_fmadd_pd(a0, w2, z0); 2620 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2621 z1 = _mm256_fmadd_pd(a1, w2, z1); 2622 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2623 z2 = _mm256_fmadd_pd(a2, w2, z2); 2624 2625 /* eighth column of a */ 2626 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2627 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2628 z0 = _mm256_fmadd_pd(a3, w3, z0); 2629 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2630 z1 = _mm256_fmadd_pd(a4, w3, z1); 2631 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2632 z2 = _mm256_fmadd_pd(a5, w3, z2); 2633 2634 /* ninth column of a */ 2635 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2636 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2637 z0 = _mm256_fmadd_pd(a0, w0, z0); 2638 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2639 z1 = _mm256_fmadd_pd(a1, w0, z1); 2640 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2641 z2 = _mm256_fmadd_pd(a2, w0, z2); 2642 } 2643 2644 _mm256_storeu_pd(&z[0], z0); 2645 _mm256_storeu_pd(&z[4], z1); 2646 _mm256_maskstore_pd(&z[8], mask1, z2); 2647 2648 v += n * bs2; 2649 if (!usecprow) z += bs; 2650 } 2651 PetscCall(VecRestoreArrayRead(xx, &x)); 2652 PetscCall(VecRestoreArray(zz, &zarray)); 2653 PetscCall(PetscLogFlops(162.0 * a->nz)); 2654 PetscFunctionReturn(0); 2655 } 2656 #endif 2657 2658 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) { 2659 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2660 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2661 const PetscScalar *x, *xb; 2662 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2663 const MatScalar *v; 2664 PetscInt mbs = a->mbs, i, j, n; 2665 const PetscInt *idx, *ii, *ridx = NULL; 2666 PetscBool usecprow = a->compressedrow.use; 2667 2668 PetscFunctionBegin; 2669 PetscCall(VecGetArrayRead(xx, &x)); 2670 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2671 2672 idx = a->j; 2673 v = a->a; 2674 if (usecprow) { 2675 if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); } 2676 mbs = a->compressedrow.nrows; 2677 ii = a->compressedrow.i; 2678 ridx = a->compressedrow.rindex; 2679 } else { 2680 ii = a->i; 2681 y = yarray; 2682 z = zarray; 2683 } 2684 2685 for (i = 0; i < mbs; i++) { 2686 n = ii[1] - ii[0]; 2687 ii++; 2688 if (usecprow) { 2689 z = zarray + 11 * ridx[i]; 2690 y = yarray + 11 * ridx[i]; 2691 } 2692 sum1 = y[0]; 2693 sum2 = y[1]; 2694 sum3 = y[2]; 2695 sum4 = y[3]; 2696 sum5 = y[4]; 2697 sum6 = y[5]; 2698 sum7 = y[6]; 2699 sum8 = y[7]; 2700 sum9 = y[8]; 2701 sum10 = y[9]; 2702 sum11 = y[10]; 2703 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2704 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2705 for (j = 0; j < n; j++) { 2706 xb = x + 11 * (*idx++); 2707 x1 = xb[0]; 2708 x2 = xb[1]; 2709 x3 = xb[2]; 2710 x4 = xb[3]; 2711 x5 = xb[4]; 2712 x6 = xb[5]; 2713 x7 = xb[6]; 2714 x8 = xb[7]; 2715 x9 = xb[8]; 2716 x10 = xb[9]; 2717 x11 = xb[10]; 2718 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2719 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2720 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2721 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2722 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2723 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2724 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2725 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2726 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2727 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2728 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2729 v += 121; 2730 } 2731 z[0] = sum1; 2732 z[1] = sum2; 2733 z[2] = sum3; 2734 z[3] = sum4; 2735 z[4] = sum5; 2736 z[5] = sum6; 2737 z[6] = sum7; 2738 z[7] = sum8; 2739 z[8] = sum9; 2740 z[9] = sum10; 2741 z[10] = sum11; 2742 if (!usecprow) { 2743 z += 11; 2744 y += 11; 2745 } 2746 } 2747 PetscCall(VecRestoreArrayRead(xx, &x)); 2748 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2749 PetscCall(PetscLogFlops(242.0 * a->nz)); 2750 PetscFunctionReturn(0); 2751 } 2752 2753 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { 2754 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2755 PetscScalar *z = NULL, *work, *workt, *zarray; 2756 const PetscScalar *x, *xb; 2757 const MatScalar *v; 2758 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2759 PetscInt ncols, k; 2760 const PetscInt *ridx = NULL, *idx, *ii; 2761 PetscBool usecprow = a->compressedrow.use; 2762 2763 PetscFunctionBegin; 2764 PetscCall(VecCopy(yy, zz)); 2765 PetscCall(VecGetArrayRead(xx, &x)); 2766 PetscCall(VecGetArray(zz, &zarray)); 2767 2768 idx = a->j; 2769 v = a->a; 2770 if (usecprow) { 2771 mbs = a->compressedrow.nrows; 2772 ii = a->compressedrow.i; 2773 ridx = a->compressedrow.rindex; 2774 } else { 2775 mbs = a->mbs; 2776 ii = a->i; 2777 z = zarray; 2778 } 2779 2780 if (!a->mult_work) { 2781 k = PetscMax(A->rmap->n, A->cmap->n); 2782 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2783 } 2784 work = a->mult_work; 2785 for (i = 0; i < mbs; i++) { 2786 n = ii[1] - ii[0]; 2787 ii++; 2788 ncols = n * bs; 2789 workt = work; 2790 for (j = 0; j < n; j++) { 2791 xb = x + bs * (*idx++); 2792 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2793 workt += bs; 2794 } 2795 if (usecprow) z = zarray + bs * ridx[i]; 2796 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2797 v += n * bs2; 2798 if (!usecprow) z += bs; 2799 } 2800 PetscCall(VecRestoreArrayRead(xx, &x)); 2801 PetscCall(VecRestoreArray(zz, &zarray)); 2802 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2803 PetscFunctionReturn(0); 2804 } 2805 2806 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 2807 PetscScalar zero = 0.0; 2808 2809 PetscFunctionBegin; 2810 PetscCall(VecSet(zz, zero)); 2811 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2812 PetscFunctionReturn(0); 2813 } 2814 2815 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 2816 PetscScalar zero = 0.0; 2817 2818 PetscFunctionBegin; 2819 PetscCall(VecSet(zz, zero)); 2820 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2821 PetscFunctionReturn(0); 2822 } 2823 2824 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 2825 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2826 PetscScalar *z, x1, x2, x3, x4, x5; 2827 const PetscScalar *x, *xb = NULL; 2828 const MatScalar *v; 2829 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2830 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2831 Mat_CompressedRow cprow = a->compressedrow; 2832 PetscBool usecprow = cprow.use; 2833 2834 PetscFunctionBegin; 2835 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2836 PetscCall(VecGetArrayRead(xx, &x)); 2837 PetscCall(VecGetArray(zz, &z)); 2838 2839 idx = a->j; 2840 v = a->a; 2841 if (usecprow) { 2842 mbs = cprow.nrows; 2843 ii = cprow.i; 2844 ridx = cprow.rindex; 2845 } else { 2846 mbs = a->mbs; 2847 ii = a->i; 2848 xb = x; 2849 } 2850 2851 switch (bs) { 2852 case 1: 2853 for (i = 0; i < mbs; i++) { 2854 if (usecprow) xb = x + ridx[i]; 2855 x1 = xb[0]; 2856 ib = idx + ii[0]; 2857 n = ii[1] - ii[0]; 2858 ii++; 2859 for (j = 0; j < n; j++) { 2860 rval = ib[j]; 2861 z[rval] += PetscConj(*v) * x1; 2862 v++; 2863 } 2864 if (!usecprow) xb++; 2865 } 2866 break; 2867 case 2: 2868 for (i = 0; i < mbs; i++) { 2869 if (usecprow) xb = x + 2 * ridx[i]; 2870 x1 = xb[0]; 2871 x2 = xb[1]; 2872 ib = idx + ii[0]; 2873 n = ii[1] - ii[0]; 2874 ii++; 2875 for (j = 0; j < n; j++) { 2876 rval = ib[j] * 2; 2877 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2878 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2879 v += 4; 2880 } 2881 if (!usecprow) xb += 2; 2882 } 2883 break; 2884 case 3: 2885 for (i = 0; i < mbs; i++) { 2886 if (usecprow) xb = x + 3 * ridx[i]; 2887 x1 = xb[0]; 2888 x2 = xb[1]; 2889 x3 = xb[2]; 2890 ib = idx + ii[0]; 2891 n = ii[1] - ii[0]; 2892 ii++; 2893 for (j = 0; j < n; j++) { 2894 rval = ib[j] * 3; 2895 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2896 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2897 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2898 v += 9; 2899 } 2900 if (!usecprow) xb += 3; 2901 } 2902 break; 2903 case 4: 2904 for (i = 0; i < mbs; i++) { 2905 if (usecprow) xb = x + 4 * ridx[i]; 2906 x1 = xb[0]; 2907 x2 = xb[1]; 2908 x3 = xb[2]; 2909 x4 = xb[3]; 2910 ib = idx + ii[0]; 2911 n = ii[1] - ii[0]; 2912 ii++; 2913 for (j = 0; j < n; j++) { 2914 rval = ib[j] * 4; 2915 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2916 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2917 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2918 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2919 v += 16; 2920 } 2921 if (!usecprow) xb += 4; 2922 } 2923 break; 2924 case 5: 2925 for (i = 0; i < mbs; i++) { 2926 if (usecprow) xb = x + 5 * ridx[i]; 2927 x1 = xb[0]; 2928 x2 = xb[1]; 2929 x3 = xb[2]; 2930 x4 = xb[3]; 2931 x5 = xb[4]; 2932 ib = idx + ii[0]; 2933 n = ii[1] - ii[0]; 2934 ii++; 2935 for (j = 0; j < n; j++) { 2936 rval = ib[j] * 5; 2937 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2938 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2939 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2940 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2941 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2942 v += 25; 2943 } 2944 if (!usecprow) xb += 5; 2945 } 2946 break; 2947 default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2948 #if 0 2949 { 2950 PetscInt ncols,k,bs2=a->bs2; 2951 PetscScalar *work,*workt,zb; 2952 const PetscScalar *xtmp; 2953 if (!a->mult_work) { 2954 k = PetscMax(A->rmap->n,A->cmap->n); 2955 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2956 } 2957 work = a->mult_work; 2958 xtmp = x; 2959 for (i=0; i<mbs; i++) { 2960 n = ii[1] - ii[0]; ii++; 2961 ncols = n*bs; 2962 PetscCall(PetscArrayzero(work,ncols)); 2963 if (usecprow) xtmp = x + bs*ridx[i]; 2964 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2965 v += n*bs2; 2966 if (!usecprow) xtmp += bs; 2967 workt = work; 2968 for (j=0; j<n; j++) { 2969 zb = z + bs*(*idx++); 2970 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2971 workt += bs; 2972 } 2973 } 2974 } 2975 #endif 2976 } 2977 PetscCall(VecRestoreArrayRead(xx, &x)); 2978 PetscCall(VecRestoreArray(zz, &z)); 2979 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 2980 PetscFunctionReturn(0); 2981 } 2982 2983 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 2984 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2985 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2986 const PetscScalar *x, *xb = NULL; 2987 const MatScalar *v; 2988 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2989 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2990 Mat_CompressedRow cprow = a->compressedrow; 2991 PetscBool usecprow = cprow.use; 2992 2993 PetscFunctionBegin; 2994 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2995 PetscCall(VecGetArrayRead(xx, &x)); 2996 PetscCall(VecGetArray(zz, &z)); 2997 2998 idx = a->j; 2999 v = a->a; 3000 if (usecprow) { 3001 mbs = cprow.nrows; 3002 ii = cprow.i; 3003 ridx = cprow.rindex; 3004 } else { 3005 mbs = a->mbs; 3006 ii = a->i; 3007 xb = x; 3008 } 3009 3010 switch (bs) { 3011 case 1: 3012 for (i = 0; i < mbs; i++) { 3013 if (usecprow) xb = x + ridx[i]; 3014 x1 = xb[0]; 3015 ib = idx + ii[0]; 3016 n = ii[1] - ii[0]; 3017 ii++; 3018 for (j = 0; j < n; j++) { 3019 rval = ib[j]; 3020 z[rval] += *v * x1; 3021 v++; 3022 } 3023 if (!usecprow) xb++; 3024 } 3025 break; 3026 case 2: 3027 for (i = 0; i < mbs; i++) { 3028 if (usecprow) xb = x + 2 * ridx[i]; 3029 x1 = xb[0]; 3030 x2 = xb[1]; 3031 ib = idx + ii[0]; 3032 n = ii[1] - ii[0]; 3033 ii++; 3034 for (j = 0; j < n; j++) { 3035 rval = ib[j] * 2; 3036 z[rval++] += v[0] * x1 + v[1] * x2; 3037 z[rval++] += v[2] * x1 + v[3] * x2; 3038 v += 4; 3039 } 3040 if (!usecprow) xb += 2; 3041 } 3042 break; 3043 case 3: 3044 for (i = 0; i < mbs; i++) { 3045 if (usecprow) xb = x + 3 * ridx[i]; 3046 x1 = xb[0]; 3047 x2 = xb[1]; 3048 x3 = xb[2]; 3049 ib = idx + ii[0]; 3050 n = ii[1] - ii[0]; 3051 ii++; 3052 for (j = 0; j < n; j++) { 3053 rval = ib[j] * 3; 3054 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3055 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3056 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3057 v += 9; 3058 } 3059 if (!usecprow) xb += 3; 3060 } 3061 break; 3062 case 4: 3063 for (i = 0; i < mbs; i++) { 3064 if (usecprow) xb = x + 4 * ridx[i]; 3065 x1 = xb[0]; 3066 x2 = xb[1]; 3067 x3 = xb[2]; 3068 x4 = xb[3]; 3069 ib = idx + ii[0]; 3070 n = ii[1] - ii[0]; 3071 ii++; 3072 for (j = 0; j < n; j++) { 3073 rval = ib[j] * 4; 3074 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3075 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3076 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3077 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3078 v += 16; 3079 } 3080 if (!usecprow) xb += 4; 3081 } 3082 break; 3083 case 5: 3084 for (i = 0; i < mbs; i++) { 3085 if (usecprow) xb = x + 5 * ridx[i]; 3086 x1 = xb[0]; 3087 x2 = xb[1]; 3088 x3 = xb[2]; 3089 x4 = xb[3]; 3090 x5 = xb[4]; 3091 ib = idx + ii[0]; 3092 n = ii[1] - ii[0]; 3093 ii++; 3094 for (j = 0; j < n; j++) { 3095 rval = ib[j] * 5; 3096 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3097 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3098 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3099 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3100 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3101 v += 25; 3102 } 3103 if (!usecprow) xb += 5; 3104 } 3105 break; 3106 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3107 PetscInt ncols, k; 3108 PetscScalar *work, *workt; 3109 const PetscScalar *xtmp; 3110 if (!a->mult_work) { 3111 k = PetscMax(A->rmap->n, A->cmap->n); 3112 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3113 } 3114 work = a->mult_work; 3115 xtmp = x; 3116 for (i = 0; i < mbs; i++) { 3117 n = ii[1] - ii[0]; 3118 ii++; 3119 ncols = n * bs; 3120 PetscCall(PetscArrayzero(work, ncols)); 3121 if (usecprow) xtmp = x + bs * ridx[i]; 3122 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3123 v += n * bs2; 3124 if (!usecprow) xtmp += bs; 3125 workt = work; 3126 for (j = 0; j < n; j++) { 3127 zb = z + bs * (*idx++); 3128 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3129 workt += bs; 3130 } 3131 } 3132 } 3133 } 3134 PetscCall(VecRestoreArrayRead(xx, &x)); 3135 PetscCall(VecRestoreArray(zz, &z)); 3136 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3137 PetscFunctionReturn(0); 3138 } 3139 3140 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) { 3141 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3142 PetscInt totalnz = a->bs2 * a->nz; 3143 PetscScalar oalpha = alpha; 3144 PetscBLASInt one = 1, tnz; 3145 3146 PetscFunctionBegin; 3147 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3148 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3149 PetscCall(PetscLogFlops(totalnz)); 3150 PetscFunctionReturn(0); 3151 } 3152 3153 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) { 3154 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3155 MatScalar *v = a->a; 3156 PetscReal sum = 0.0; 3157 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3158 3159 PetscFunctionBegin; 3160 if (type == NORM_FROBENIUS) { 3161 #if defined(PETSC_USE_REAL___FP16) 3162 PetscBLASInt one = 1, cnt = bs2 * nz; 3163 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3164 #else 3165 for (i = 0; i < bs2 * nz; i++) { 3166 sum += PetscRealPart(PetscConj(*v) * (*v)); 3167 v++; 3168 } 3169 #endif 3170 *norm = PetscSqrtReal(sum); 3171 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3172 } else if (type == NORM_1) { /* maximum column sum */ 3173 PetscReal *tmp; 3174 PetscInt *bcol = a->j; 3175 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3176 for (i = 0; i < nz; i++) { 3177 for (j = 0; j < bs; j++) { 3178 k1 = bs * (*bcol) + j; /* column index */ 3179 for (k = 0; k < bs; k++) { 3180 tmp[k1] += PetscAbsScalar(*v); 3181 v++; 3182 } 3183 } 3184 bcol++; 3185 } 3186 *norm = 0.0; 3187 for (j = 0; j < A->cmap->n; j++) { 3188 if (tmp[j] > *norm) *norm = tmp[j]; 3189 } 3190 PetscCall(PetscFree(tmp)); 3191 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3192 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3193 *norm = 0.0; 3194 for (k = 0; k < bs; k++) { 3195 for (j = 0; j < a->mbs; j++) { 3196 v = a->a + bs2 * a->i[j] + k; 3197 sum = 0.0; 3198 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3199 for (k1 = 0; k1 < bs; k1++) { 3200 sum += PetscAbsScalar(*v); 3201 v += bs; 3202 } 3203 } 3204 if (sum > *norm) *norm = sum; 3205 } 3206 } 3207 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3208 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3209 PetscFunctionReturn(0); 3210 } 3211 3212 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) { 3213 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3214 3215 PetscFunctionBegin; 3216 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3217 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3218 *flg = PETSC_FALSE; 3219 PetscFunctionReturn(0); 3220 } 3221 3222 /* if the a->i are the same */ 3223 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3224 if (!*flg) PetscFunctionReturn(0); 3225 3226 /* if a->j are the same */ 3227 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3228 if (!*flg) PetscFunctionReturn(0); 3229 3230 /* if a->a are the same */ 3231 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3232 PetscFunctionReturn(0); 3233 } 3234 3235 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) { 3236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3237 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3238 PetscScalar *x, zero = 0.0; 3239 MatScalar *aa, *aa_j; 3240 3241 PetscFunctionBegin; 3242 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3243 bs = A->rmap->bs; 3244 aa = a->a; 3245 ai = a->i; 3246 aj = a->j; 3247 ambs = a->mbs; 3248 bs2 = a->bs2; 3249 3250 PetscCall(VecSet(v, zero)); 3251 PetscCall(VecGetArray(v, &x)); 3252 PetscCall(VecGetLocalSize(v, &n)); 3253 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3254 for (i = 0; i < ambs; i++) { 3255 for (j = ai[i]; j < ai[i + 1]; j++) { 3256 if (aj[j] == i) { 3257 row = i * bs; 3258 aa_j = aa + j * bs2; 3259 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3260 break; 3261 } 3262 } 3263 } 3264 PetscCall(VecRestoreArray(v, &x)); 3265 PetscFunctionReturn(0); 3266 } 3267 3268 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) { 3269 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3270 const PetscScalar *l, *r, *li, *ri; 3271 PetscScalar x; 3272 MatScalar *aa, *v; 3273 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3274 const PetscInt *ai, *aj; 3275 3276 PetscFunctionBegin; 3277 ai = a->i; 3278 aj = a->j; 3279 aa = a->a; 3280 m = A->rmap->n; 3281 n = A->cmap->n; 3282 bs = A->rmap->bs; 3283 mbs = a->mbs; 3284 bs2 = a->bs2; 3285 if (ll) { 3286 PetscCall(VecGetArrayRead(ll, &l)); 3287 PetscCall(VecGetLocalSize(ll, &lm)); 3288 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3289 for (i = 0; i < mbs; i++) { /* for each block row */ 3290 M = ai[i + 1] - ai[i]; 3291 li = l + i * bs; 3292 v = aa + bs2 * ai[i]; 3293 for (j = 0; j < M; j++) { /* for each block */ 3294 for (k = 0; k < bs2; k++) { (*v++) *= li[k % bs]; } 3295 } 3296 } 3297 PetscCall(VecRestoreArrayRead(ll, &l)); 3298 PetscCall(PetscLogFlops(a->nz)); 3299 } 3300 3301 if (rr) { 3302 PetscCall(VecGetArrayRead(rr, &r)); 3303 PetscCall(VecGetLocalSize(rr, &rn)); 3304 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3305 for (i = 0; i < mbs; i++) { /* for each block row */ 3306 iai = ai[i]; 3307 M = ai[i + 1] - iai; 3308 v = aa + bs2 * iai; 3309 for (j = 0; j < M; j++) { /* for each block */ 3310 ri = r + bs * aj[iai + j]; 3311 for (k = 0; k < bs; k++) { 3312 x = ri[k]; 3313 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3314 v += bs; 3315 } 3316 } 3317 } 3318 PetscCall(VecRestoreArrayRead(rr, &r)); 3319 PetscCall(PetscLogFlops(a->nz)); 3320 } 3321 PetscFunctionReturn(0); 3322 } 3323 3324 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) { 3325 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3326 3327 PetscFunctionBegin; 3328 info->block_size = a->bs2; 3329 info->nz_allocated = a->bs2 * a->maxnz; 3330 info->nz_used = a->bs2 * a->nz; 3331 info->nz_unneeded = info->nz_allocated - info->nz_used; 3332 info->assemblies = A->num_ass; 3333 info->mallocs = A->info.mallocs; 3334 info->memory = ((PetscObject)A)->mem; 3335 if (A->factortype) { 3336 info->fill_ratio_given = A->info.fill_ratio_given; 3337 info->fill_ratio_needed = A->info.fill_ratio_needed; 3338 info->factor_mallocs = A->info.factor_mallocs; 3339 } else { 3340 info->fill_ratio_given = 0; 3341 info->fill_ratio_needed = 0; 3342 info->factor_mallocs = 0; 3343 } 3344 PetscFunctionReturn(0); 3345 } 3346 3347 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) { 3348 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3349 3350 PetscFunctionBegin; 3351 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3352 PetscFunctionReturn(0); 3353 } 3354 3355 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 3356 PetscFunctionBegin; 3357 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3358 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3359 PetscFunctionReturn(0); 3360 } 3361 3362 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3363 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3364 PetscScalar *z = NULL, sum1; 3365 const PetscScalar *xb; 3366 PetscScalar x1; 3367 const MatScalar *v, *vv; 3368 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3369 PetscBool usecprow = a->compressedrow.use; 3370 3371 PetscFunctionBegin; 3372 idx = a->j; 3373 v = a->a; 3374 if (usecprow) { 3375 mbs = a->compressedrow.nrows; 3376 ii = a->compressedrow.i; 3377 ridx = a->compressedrow.rindex; 3378 } else { 3379 mbs = a->mbs; 3380 ii = a->i; 3381 z = c; 3382 } 3383 3384 for (i = 0; i < mbs; i++) { 3385 n = ii[1] - ii[0]; 3386 ii++; 3387 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3388 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3389 if (usecprow) z = c + ridx[i]; 3390 jj = idx; 3391 vv = v; 3392 for (k = 0; k < cn; k++) { 3393 idx = jj; 3394 v = vv; 3395 sum1 = 0.0; 3396 for (j = 0; j < n; j++) { 3397 xb = b + (*idx++); 3398 x1 = xb[0 + k * bm]; 3399 sum1 += v[0] * x1; 3400 v += 1; 3401 } 3402 z[0 + k * cm] = sum1; 3403 } 3404 if (!usecprow) z += 1; 3405 } 3406 PetscFunctionReturn(0); 3407 } 3408 3409 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3410 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3411 PetscScalar *z = NULL, sum1, sum2; 3412 const PetscScalar *xb; 3413 PetscScalar x1, x2; 3414 const MatScalar *v, *vv; 3415 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3416 PetscBool usecprow = a->compressedrow.use; 3417 3418 PetscFunctionBegin; 3419 idx = a->j; 3420 v = a->a; 3421 if (usecprow) { 3422 mbs = a->compressedrow.nrows; 3423 ii = a->compressedrow.i; 3424 ridx = a->compressedrow.rindex; 3425 } else { 3426 mbs = a->mbs; 3427 ii = a->i; 3428 z = c; 3429 } 3430 3431 for (i = 0; i < mbs; i++) { 3432 n = ii[1] - ii[0]; 3433 ii++; 3434 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3435 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3436 if (usecprow) z = c + 2 * ridx[i]; 3437 jj = idx; 3438 vv = v; 3439 for (k = 0; k < cn; k++) { 3440 idx = jj; 3441 v = vv; 3442 sum1 = 0.0; 3443 sum2 = 0.0; 3444 for (j = 0; j < n; j++) { 3445 xb = b + 2 * (*idx++); 3446 x1 = xb[0 + k * bm]; 3447 x2 = xb[1 + k * bm]; 3448 sum1 += v[0] * x1 + v[2] * x2; 3449 sum2 += v[1] * x1 + v[3] * x2; 3450 v += 4; 3451 } 3452 z[0 + k * cm] = sum1; 3453 z[1 + k * cm] = sum2; 3454 } 3455 if (!usecprow) z += 2; 3456 } 3457 PetscFunctionReturn(0); 3458 } 3459 3460 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3461 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3462 PetscScalar *z = NULL, sum1, sum2, sum3; 3463 const PetscScalar *xb; 3464 PetscScalar x1, x2, x3; 3465 const MatScalar *v, *vv; 3466 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3467 PetscBool usecprow = a->compressedrow.use; 3468 3469 PetscFunctionBegin; 3470 idx = a->j; 3471 v = a->a; 3472 if (usecprow) { 3473 mbs = a->compressedrow.nrows; 3474 ii = a->compressedrow.i; 3475 ridx = a->compressedrow.rindex; 3476 } else { 3477 mbs = a->mbs; 3478 ii = a->i; 3479 z = c; 3480 } 3481 3482 for (i = 0; i < mbs; i++) { 3483 n = ii[1] - ii[0]; 3484 ii++; 3485 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3486 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3487 if (usecprow) z = c + 3 * ridx[i]; 3488 jj = idx; 3489 vv = v; 3490 for (k = 0; k < cn; k++) { 3491 idx = jj; 3492 v = vv; 3493 sum1 = 0.0; 3494 sum2 = 0.0; 3495 sum3 = 0.0; 3496 for (j = 0; j < n; j++) { 3497 xb = b + 3 * (*idx++); 3498 x1 = xb[0 + k * bm]; 3499 x2 = xb[1 + k * bm]; 3500 x3 = xb[2 + k * bm]; 3501 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3502 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3503 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3504 v += 9; 3505 } 3506 z[0 + k * cm] = sum1; 3507 z[1 + k * cm] = sum2; 3508 z[2 + k * cm] = sum3; 3509 } 3510 if (!usecprow) z += 3; 3511 } 3512 PetscFunctionReturn(0); 3513 } 3514 3515 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3517 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3518 const PetscScalar *xb; 3519 PetscScalar x1, x2, x3, x4; 3520 const MatScalar *v, *vv; 3521 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3522 PetscBool usecprow = a->compressedrow.use; 3523 3524 PetscFunctionBegin; 3525 idx = a->j; 3526 v = a->a; 3527 if (usecprow) { 3528 mbs = a->compressedrow.nrows; 3529 ii = a->compressedrow.i; 3530 ridx = a->compressedrow.rindex; 3531 } else { 3532 mbs = a->mbs; 3533 ii = a->i; 3534 z = c; 3535 } 3536 3537 for (i = 0; i < mbs; i++) { 3538 n = ii[1] - ii[0]; 3539 ii++; 3540 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3541 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3542 if (usecprow) z = c + 4 * ridx[i]; 3543 jj = idx; 3544 vv = v; 3545 for (k = 0; k < cn; k++) { 3546 idx = jj; 3547 v = vv; 3548 sum1 = 0.0; 3549 sum2 = 0.0; 3550 sum3 = 0.0; 3551 sum4 = 0.0; 3552 for (j = 0; j < n; j++) { 3553 xb = b + 4 * (*idx++); 3554 x1 = xb[0 + k * bm]; 3555 x2 = xb[1 + k * bm]; 3556 x3 = xb[2 + k * bm]; 3557 x4 = xb[3 + k * bm]; 3558 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3559 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3560 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3561 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3562 v += 16; 3563 } 3564 z[0 + k * cm] = sum1; 3565 z[1 + k * cm] = sum2; 3566 z[2 + k * cm] = sum3; 3567 z[3 + k * cm] = sum4; 3568 } 3569 if (!usecprow) z += 4; 3570 } 3571 PetscFunctionReturn(0); 3572 } 3573 3574 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3575 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3576 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3577 const PetscScalar *xb; 3578 PetscScalar x1, x2, x3, x4, x5; 3579 const MatScalar *v, *vv; 3580 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3581 PetscBool usecprow = a->compressedrow.use; 3582 3583 PetscFunctionBegin; 3584 idx = a->j; 3585 v = a->a; 3586 if (usecprow) { 3587 mbs = a->compressedrow.nrows; 3588 ii = a->compressedrow.i; 3589 ridx = a->compressedrow.rindex; 3590 } else { 3591 mbs = a->mbs; 3592 ii = a->i; 3593 z = c; 3594 } 3595 3596 for (i = 0; i < mbs; i++) { 3597 n = ii[1] - ii[0]; 3598 ii++; 3599 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3600 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3601 if (usecprow) z = c + 5 * ridx[i]; 3602 jj = idx; 3603 vv = v; 3604 for (k = 0; k < cn; k++) { 3605 idx = jj; 3606 v = vv; 3607 sum1 = 0.0; 3608 sum2 = 0.0; 3609 sum3 = 0.0; 3610 sum4 = 0.0; 3611 sum5 = 0.0; 3612 for (j = 0; j < n; j++) { 3613 xb = b + 5 * (*idx++); 3614 x1 = xb[0 + k * bm]; 3615 x2 = xb[1 + k * bm]; 3616 x3 = xb[2 + k * bm]; 3617 x4 = xb[3 + k * bm]; 3618 x5 = xb[4 + k * bm]; 3619 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3620 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3621 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3622 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3623 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3624 v += 25; 3625 } 3626 z[0 + k * cm] = sum1; 3627 z[1 + k * cm] = sum2; 3628 z[2 + k * cm] = sum3; 3629 z[3 + k * cm] = sum4; 3630 z[4 + k * cm] = sum5; 3631 } 3632 if (!usecprow) z += 5; 3633 } 3634 PetscFunctionReturn(0); 3635 } 3636 3637 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) { 3638 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3639 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3640 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3641 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3642 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3643 PetscBLASInt bbs, bcn, bbm, bcm; 3644 PetscScalar *z = NULL; 3645 PetscScalar *c, *b; 3646 const MatScalar *v; 3647 const PetscInt *idx, *ii, *ridx = NULL; 3648 PetscScalar _DZero = 0.0, _DOne = 1.0; 3649 PetscBool usecprow = a->compressedrow.use; 3650 3651 PetscFunctionBegin; 3652 if (!cm || !cn) PetscFunctionReturn(0); 3653 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3654 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3655 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3656 b = bd->v; 3657 if (a->nonzerorowcnt != A->rmap->n) { PetscCall(MatZeroEntries(C)); } 3658 PetscCall(MatDenseGetArray(C, &c)); 3659 switch (bs) { 3660 case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break; 3661 case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break; 3662 case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break; 3663 case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break; 3664 case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break; 3665 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3666 PetscCall(PetscBLASIntCast(bs, &bbs)); 3667 PetscCall(PetscBLASIntCast(cn, &bcn)); 3668 PetscCall(PetscBLASIntCast(bm, &bbm)); 3669 PetscCall(PetscBLASIntCast(cm, &bcm)); 3670 idx = a->j; 3671 v = a->a; 3672 if (usecprow) { 3673 mbs = a->compressedrow.nrows; 3674 ii = a->compressedrow.i; 3675 ridx = a->compressedrow.rindex; 3676 } else { 3677 mbs = a->mbs; 3678 ii = a->i; 3679 z = c; 3680 } 3681 for (i = 0; i < mbs; i++) { 3682 n = ii[1] - ii[0]; 3683 ii++; 3684 if (usecprow) z = c + bs * ridx[i]; 3685 if (n) { 3686 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3687 v += bs2; 3688 } 3689 for (j = 1; j < n; j++) { 3690 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3691 v += bs2; 3692 } 3693 if (!usecprow) z += bs; 3694 } 3695 } 3696 PetscCall(MatDenseRestoreArray(C, &c)); 3697 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3698 PetscFunctionReturn(0); 3699 } 3700