1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 14 const PetscInt *idx; 15 PetscInt start, end, *ai, *aj, bs, *nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 25 26 PetscCall(PetscBTCreate(m, &table)); 27 PetscCall(PetscMalloc1(m + 1, &nidx)); 28 PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 29 30 for (i = 0; i < is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 PetscCall(PetscBTMemzero(m, table)); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 PetscCall(ISGetIndices(is[i], &idx)); 37 PetscCall(ISGetLocalSize(is[i], &n)); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j = 0; j < n; ++j) { 41 ival = idx[j] / bs; /* convert the indices into block indices */ 42 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 43 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44 } 45 PetscCall(ISRestoreIndices(is[i], &idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j = 0; j < ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row + 1]; 55 for (l = start; l < end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j = 0; j < isz; j++) { 63 for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 64 } 65 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 66 } 67 PetscCall(PetscBTDestroy(&table)); 68 PetscCall(PetscFree(nidx)); 69 PetscCall(PetscFree(nidx2)); 70 PetscFunctionReturn(0); 71 } 72 73 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 74 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 75 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 76 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 77 const PetscInt *irow, *icol; 78 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 79 PetscInt *aj = a->j, *ai = a->i; 80 MatScalar *mat_a; 81 Mat C; 82 PetscBool flag; 83 84 PetscFunctionBegin; 85 PetscCall(ISGetIndices(isrow, &irow)); 86 PetscCall(ISGetIndices(iscol, &icol)); 87 PetscCall(ISGetLocalSize(isrow, &nrows)); 88 PetscCall(ISGetLocalSize(iscol, &ncols)); 89 90 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 91 ssmap = smap; 92 PetscCall(PetscMalloc1(1 + nrows, &lens)); 93 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 94 /* determine lens of each row */ 95 for (i = 0; i < nrows; i++) { 96 kstart = ai[irow[i]]; 97 kend = kstart + a->ilen[irow[i]]; 98 lens[i] = 0; 99 for (k = kstart; k < kend; k++) { 100 if (ssmap[aj[k]]) lens[i]++; 101 } 102 } 103 /* Create and fill new matrix */ 104 if (scall == MAT_REUSE_MATRIX) { 105 c = (Mat_SeqBAIJ *)((*B)->data); 106 107 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 108 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 109 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 110 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 111 C = *B; 112 } else { 113 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 114 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 115 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 116 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 117 } 118 c = (Mat_SeqBAIJ *)(C->data); 119 for (i = 0; i < nrows; i++) { 120 row = irow[i]; 121 kstart = ai[row]; 122 kend = kstart + a->ilen[row]; 123 mat_i = c->i[i]; 124 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 125 mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 126 mat_ilen = c->ilen + i; 127 for (k = kstart; k < kend; k++) { 128 if ((tcol = ssmap[a->j[k]])) { 129 *mat_j++ = tcol - 1; 130 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 131 mat_a += bs2; 132 (*mat_ilen)++; 133 } 134 } 135 } 136 /* sort */ 137 if (c->j && c->a) { 138 MatScalar *work; 139 PetscCall(PetscMalloc1(bs2, &work)); 140 for (i = 0; i < nrows; i++) { 141 PetscInt ilen; 142 mat_i = c->i[i]; 143 mat_j = c->j + mat_i; 144 mat_a = c->a + mat_i * bs2; 145 ilen = c->ilen[i]; 146 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 147 } 148 PetscCall(PetscFree(work)); 149 } 150 151 /* Free work space */ 152 PetscCall(ISRestoreIndices(iscol, &icol)); 153 PetscCall(PetscFree(smap)); 154 PetscCall(PetscFree(lens)); 155 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 156 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 157 158 PetscCall(ISRestoreIndices(isrow, &irow)); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 164 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 165 IS is1, is2; 166 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 167 const PetscInt *irow, *icol; 168 169 PetscFunctionBegin; 170 PetscCall(ISGetIndices(isrow, &irow)); 171 PetscCall(ISGetIndices(iscol, &icol)); 172 PetscCall(ISGetLocalSize(isrow, &nrows)); 173 PetscCall(ISGetLocalSize(iscol, &ncols)); 174 175 /* Verify if the indices corespond to each element in a block 176 and form the IS with compressed IS */ 177 maxmnbs = PetscMax(a->mbs, a->nbs); 178 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 179 PetscCall(PetscArrayzero(vary, a->mbs)); 180 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 181 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 182 count = 0; 183 for (i = 0; i < nrows; i++) { 184 j = irow[i] / bs; 185 if ((vary[j]--) == bs) iary[count++] = j; 186 } 187 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 188 189 PetscCall(PetscArrayzero(vary, a->nbs)); 190 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 191 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 192 count = 0; 193 for (i = 0; i < ncols; i++) { 194 j = icol[i] / bs; 195 if ((vary[j]--) == bs) iary[count++] = j; 196 } 197 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 198 PetscCall(ISRestoreIndices(isrow, &irow)); 199 PetscCall(ISRestoreIndices(iscol, &icol)); 200 PetscCall(PetscFree2(vary, iary)); 201 202 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 203 PetscCall(ISDestroy(&is1)); 204 PetscCall(ISDestroy(&is2)); 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) { 209 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 210 Mat_SubSppt *submatj = c->submatis1; 211 212 PetscFunctionBegin; 213 PetscCall((*submatj->destroy)(C)); 214 PetscCall(MatDestroySubMatrix_Private(submatj)); 215 PetscFunctionReturn(0); 216 } 217 218 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 219 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) { 220 PetscInt i; 221 Mat C; 222 Mat_SeqBAIJ *c; 223 Mat_SubSppt *submatj; 224 225 PetscFunctionBegin; 226 for (i = 0; i < n; i++) { 227 C = (*mat)[i]; 228 c = (Mat_SeqBAIJ *)C->data; 229 submatj = c->submatis1; 230 if (submatj) { 231 if (--((PetscObject)C)->refct <= 0) { 232 PetscCall(PetscFree(C->factorprefix)); 233 PetscCall((*submatj->destroy)(C)); 234 PetscCall(MatDestroySubMatrix_Private(submatj)); 235 PetscCall(PetscFree(C->defaultvectype)); 236 PetscCall(PetscFree(C->defaultrandtype)); 237 PetscCall(PetscLayoutDestroy(&C->rmap)); 238 PetscCall(PetscLayoutDestroy(&C->cmap)); 239 PetscCall(PetscHeaderDestroy(&C)); 240 } 241 } else { 242 PetscCall(MatDestroy(&C)); 243 } 244 } 245 246 /* Destroy Dummy submatrices created for reuse */ 247 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 248 249 PetscCall(PetscFree(*mat)); 250 PetscFunctionReturn(0); 251 } 252 253 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { 254 PetscInt i; 255 256 PetscFunctionBegin; 257 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 258 259 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 260 PetscFunctionReturn(0); 261 } 262 263 /* -------------------------------------------------------*/ 264 /* Should check that shapes of vectors and matrices match */ 265 /* -------------------------------------------------------*/ 266 267 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) { 268 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 269 PetscScalar *z, sum; 270 const PetscScalar *x; 271 const MatScalar *v; 272 PetscInt mbs, i, n; 273 const PetscInt *idx, *ii, *ridx = NULL; 274 PetscBool usecprow = a->compressedrow.use; 275 276 PetscFunctionBegin; 277 PetscCall(VecGetArrayRead(xx, &x)); 278 PetscCall(VecGetArrayWrite(zz, &z)); 279 280 if (usecprow) { 281 mbs = a->compressedrow.nrows; 282 ii = a->compressedrow.i; 283 ridx = a->compressedrow.rindex; 284 PetscCall(PetscArrayzero(z, a->mbs)); 285 } else { 286 mbs = a->mbs; 287 ii = a->i; 288 } 289 290 for (i = 0; i < mbs; i++) { 291 n = ii[1] - ii[0]; 292 v = a->a + ii[0]; 293 idx = a->j + ii[0]; 294 ii++; 295 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 296 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 297 sum = 0.0; 298 PetscSparseDensePlusDot(sum, x, v, idx, n); 299 if (usecprow) { 300 z[ridx[i]] = sum; 301 } else { 302 z[i] = sum; 303 } 304 } 305 PetscCall(VecRestoreArrayRead(xx, &x)); 306 PetscCall(VecRestoreArrayWrite(zz, &z)); 307 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 308 PetscFunctionReturn(0); 309 } 310 311 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) { 312 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 313 PetscScalar *z = NULL, sum1, sum2, *zarray; 314 const PetscScalar *x, *xb; 315 PetscScalar x1, x2; 316 const MatScalar *v; 317 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 318 PetscBool usecprow = a->compressedrow.use; 319 320 PetscFunctionBegin; 321 PetscCall(VecGetArrayRead(xx, &x)); 322 PetscCall(VecGetArrayWrite(zz, &zarray)); 323 324 idx = a->j; 325 v = a->a; 326 if (usecprow) { 327 mbs = a->compressedrow.nrows; 328 ii = a->compressedrow.i; 329 ridx = a->compressedrow.rindex; 330 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 331 } else { 332 mbs = a->mbs; 333 ii = a->i; 334 z = zarray; 335 } 336 337 for (i = 0; i < mbs; i++) { 338 n = ii[1] - ii[0]; 339 ii++; 340 sum1 = 0.0; 341 sum2 = 0.0; 342 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 343 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 344 for (j = 0; j < n; j++) { 345 xb = x + 2 * (*idx++); 346 x1 = xb[0]; 347 x2 = xb[1]; 348 sum1 += v[0] * x1 + v[2] * x2; 349 sum2 += v[1] * x1 + v[3] * x2; 350 v += 4; 351 } 352 if (usecprow) z = zarray + 2 * ridx[i]; 353 z[0] = sum1; 354 z[1] = sum2; 355 if (!usecprow) z += 2; 356 } 357 PetscCall(VecRestoreArrayRead(xx, &x)); 358 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 359 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 360 PetscFunctionReturn(0); 361 } 362 363 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) { 364 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 365 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 366 const PetscScalar *x, *xb; 367 const MatScalar *v; 368 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 369 PetscBool usecprow = a->compressedrow.use; 370 371 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 372 #pragma disjoint(*v, *z, *xb) 373 #endif 374 375 PetscFunctionBegin; 376 PetscCall(VecGetArrayRead(xx, &x)); 377 PetscCall(VecGetArrayWrite(zz, &zarray)); 378 379 idx = a->j; 380 v = a->a; 381 if (usecprow) { 382 mbs = a->compressedrow.nrows; 383 ii = a->compressedrow.i; 384 ridx = a->compressedrow.rindex; 385 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 386 } else { 387 mbs = a->mbs; 388 ii = a->i; 389 z = zarray; 390 } 391 392 for (i = 0; i < mbs; i++) { 393 n = ii[1] - ii[0]; 394 ii++; 395 sum1 = 0.0; 396 sum2 = 0.0; 397 sum3 = 0.0; 398 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 399 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 400 for (j = 0; j < n; j++) { 401 xb = x + 3 * (*idx++); 402 x1 = xb[0]; 403 x2 = xb[1]; 404 x3 = xb[2]; 405 406 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 407 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 408 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 409 v += 9; 410 } 411 if (usecprow) z = zarray + 3 * ridx[i]; 412 z[0] = sum1; 413 z[1] = sum2; 414 z[2] = sum3; 415 if (!usecprow) z += 3; 416 } 417 PetscCall(VecRestoreArrayRead(xx, &x)); 418 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 419 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 420 PetscFunctionReturn(0); 421 } 422 423 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) { 424 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 425 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 426 const PetscScalar *x, *xb; 427 const MatScalar *v; 428 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 429 PetscBool usecprow = a->compressedrow.use; 430 431 PetscFunctionBegin; 432 PetscCall(VecGetArrayRead(xx, &x)); 433 PetscCall(VecGetArrayWrite(zz, &zarray)); 434 435 idx = a->j; 436 v = a->a; 437 if (usecprow) { 438 mbs = a->compressedrow.nrows; 439 ii = a->compressedrow.i; 440 ridx = a->compressedrow.rindex; 441 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 442 } else { 443 mbs = a->mbs; 444 ii = a->i; 445 z = zarray; 446 } 447 448 for (i = 0; i < mbs; i++) { 449 n = ii[1] - ii[0]; 450 ii++; 451 sum1 = 0.0; 452 sum2 = 0.0; 453 sum3 = 0.0; 454 sum4 = 0.0; 455 456 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 457 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 458 for (j = 0; j < n; j++) { 459 xb = x + 4 * (*idx++); 460 x1 = xb[0]; 461 x2 = xb[1]; 462 x3 = xb[2]; 463 x4 = xb[3]; 464 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 465 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 466 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 467 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 468 v += 16; 469 } 470 if (usecprow) z = zarray + 4 * ridx[i]; 471 z[0] = sum1; 472 z[1] = sum2; 473 z[2] = sum3; 474 z[3] = sum4; 475 if (!usecprow) z += 4; 476 } 477 PetscCall(VecRestoreArrayRead(xx, &x)); 478 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 479 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 480 PetscFunctionReturn(0); 481 } 482 483 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) { 484 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 485 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 486 const PetscScalar *xb, *x; 487 const MatScalar *v; 488 const PetscInt *idx, *ii, *ridx = NULL; 489 PetscInt mbs, i, j, n; 490 PetscBool usecprow = a->compressedrow.use; 491 492 PetscFunctionBegin; 493 PetscCall(VecGetArrayRead(xx, &x)); 494 PetscCall(VecGetArrayWrite(zz, &zarray)); 495 496 idx = a->j; 497 v = a->a; 498 if (usecprow) { 499 mbs = a->compressedrow.nrows; 500 ii = a->compressedrow.i; 501 ridx = a->compressedrow.rindex; 502 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 503 } else { 504 mbs = a->mbs; 505 ii = a->i; 506 z = zarray; 507 } 508 509 for (i = 0; i < mbs; i++) { 510 n = ii[1] - ii[0]; 511 ii++; 512 sum1 = 0.0; 513 sum2 = 0.0; 514 sum3 = 0.0; 515 sum4 = 0.0; 516 sum5 = 0.0; 517 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 518 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 519 for (j = 0; j < n; j++) { 520 xb = x + 5 * (*idx++); 521 x1 = xb[0]; 522 x2 = xb[1]; 523 x3 = xb[2]; 524 x4 = xb[3]; 525 x5 = xb[4]; 526 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 527 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 528 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 529 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 530 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 531 v += 25; 532 } 533 if (usecprow) z = zarray + 5 * ridx[i]; 534 z[0] = sum1; 535 z[1] = sum2; 536 z[2] = sum3; 537 z[3] = sum4; 538 z[4] = sum5; 539 if (!usecprow) z += 5; 540 } 541 PetscCall(VecRestoreArrayRead(xx, &x)); 542 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 543 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 544 PetscFunctionReturn(0); 545 } 546 547 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) { 548 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 549 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 550 const PetscScalar *x, *xb; 551 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 552 const MatScalar *v; 553 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 554 PetscBool usecprow = a->compressedrow.use; 555 556 PetscFunctionBegin; 557 PetscCall(VecGetArrayRead(xx, &x)); 558 PetscCall(VecGetArrayWrite(zz, &zarray)); 559 560 idx = a->j; 561 v = a->a; 562 if (usecprow) { 563 mbs = a->compressedrow.nrows; 564 ii = a->compressedrow.i; 565 ridx = a->compressedrow.rindex; 566 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 567 } else { 568 mbs = a->mbs; 569 ii = a->i; 570 z = zarray; 571 } 572 573 for (i = 0; i < mbs; i++) { 574 n = ii[1] - ii[0]; 575 ii++; 576 sum1 = 0.0; 577 sum2 = 0.0; 578 sum3 = 0.0; 579 sum4 = 0.0; 580 sum5 = 0.0; 581 sum6 = 0.0; 582 583 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 584 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 585 for (j = 0; j < n; j++) { 586 xb = x + 6 * (*idx++); 587 x1 = xb[0]; 588 x2 = xb[1]; 589 x3 = xb[2]; 590 x4 = xb[3]; 591 x5 = xb[4]; 592 x6 = xb[5]; 593 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 594 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 595 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 596 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 597 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 598 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 599 v += 36; 600 } 601 if (usecprow) z = zarray + 6 * ridx[i]; 602 z[0] = sum1; 603 z[1] = sum2; 604 z[2] = sum3; 605 z[3] = sum4; 606 z[4] = sum5; 607 z[5] = sum6; 608 if (!usecprow) z += 6; 609 } 610 611 PetscCall(VecRestoreArrayRead(xx, &x)); 612 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 613 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 614 PetscFunctionReturn(0); 615 } 616 617 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) { 618 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 619 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 620 const PetscScalar *x, *xb; 621 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 622 const MatScalar *v; 623 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 624 PetscBool usecprow = a->compressedrow.use; 625 626 PetscFunctionBegin; 627 PetscCall(VecGetArrayRead(xx, &x)); 628 PetscCall(VecGetArrayWrite(zz, &zarray)); 629 630 idx = a->j; 631 v = a->a; 632 if (usecprow) { 633 mbs = a->compressedrow.nrows; 634 ii = a->compressedrow.i; 635 ridx = a->compressedrow.rindex; 636 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 637 } else { 638 mbs = a->mbs; 639 ii = a->i; 640 z = zarray; 641 } 642 643 for (i = 0; i < mbs; i++) { 644 n = ii[1] - ii[0]; 645 ii++; 646 sum1 = 0.0; 647 sum2 = 0.0; 648 sum3 = 0.0; 649 sum4 = 0.0; 650 sum5 = 0.0; 651 sum6 = 0.0; 652 sum7 = 0.0; 653 654 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 655 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 656 for (j = 0; j < n; j++) { 657 xb = x + 7 * (*idx++); 658 x1 = xb[0]; 659 x2 = xb[1]; 660 x3 = xb[2]; 661 x4 = xb[3]; 662 x5 = xb[4]; 663 x6 = xb[5]; 664 x7 = xb[6]; 665 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 666 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 667 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 668 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 669 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 670 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 671 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 672 v += 49; 673 } 674 if (usecprow) z = zarray + 7 * ridx[i]; 675 z[0] = sum1; 676 z[1] = sum2; 677 z[2] = sum3; 678 z[3] = sum4; 679 z[4] = sum5; 680 z[5] = sum6; 681 z[6] = sum7; 682 if (!usecprow) z += 7; 683 } 684 685 PetscCall(VecRestoreArrayRead(xx, &x)); 686 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 687 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 688 PetscFunctionReturn(0); 689 } 690 691 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 692 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) { 693 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 694 PetscScalar *z = NULL, *work, *workt, *zarray; 695 const PetscScalar *x, *xb; 696 const MatScalar *v; 697 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 698 const PetscInt *idx, *ii, *ridx = NULL; 699 PetscInt k; 700 PetscBool usecprow = a->compressedrow.use; 701 702 __m256d a0, a1, a2, a3, a4, a5; 703 __m256d w0, w1, w2, w3; 704 __m256d z0, z1, z2; 705 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 706 707 PetscFunctionBegin; 708 PetscCall(VecGetArrayRead(xx, &x)); 709 PetscCall(VecGetArrayWrite(zz, &zarray)); 710 711 idx = a->j; 712 v = a->a; 713 if (usecprow) { 714 mbs = a->compressedrow.nrows; 715 ii = a->compressedrow.i; 716 ridx = a->compressedrow.rindex; 717 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 718 } else { 719 mbs = a->mbs; 720 ii = a->i; 721 z = zarray; 722 } 723 724 if (!a->mult_work) { 725 k = PetscMax(A->rmap->n, A->cmap->n); 726 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 727 } 728 729 work = a->mult_work; 730 for (i = 0; i < mbs; i++) { 731 n = ii[1] - ii[0]; 732 ii++; 733 workt = work; 734 for (j = 0; j < n; j++) { 735 xb = x + bs * (*idx++); 736 for (k = 0; k < bs; k++) workt[k] = xb[k]; 737 workt += bs; 738 } 739 if (usecprow) z = zarray + bs * ridx[i]; 740 741 z0 = _mm256_setzero_pd(); 742 z1 = _mm256_setzero_pd(); 743 z2 = _mm256_setzero_pd(); 744 745 for (j = 0; j < n; j++) { 746 /* first column of a */ 747 w0 = _mm256_set1_pd(work[j * 9]); 748 a0 = _mm256_loadu_pd(&v[j * 81]); 749 z0 = _mm256_fmadd_pd(a0, w0, z0); 750 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 751 z1 = _mm256_fmadd_pd(a1, w0, z1); 752 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 753 z2 = _mm256_fmadd_pd(a2, w0, z2); 754 755 /* second column of a */ 756 w1 = _mm256_set1_pd(work[j * 9 + 1]); 757 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 758 z0 = _mm256_fmadd_pd(a0, w1, z0); 759 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 760 z1 = _mm256_fmadd_pd(a1, w1, z1); 761 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 762 z2 = _mm256_fmadd_pd(a2, w1, z2); 763 764 /* third column of a */ 765 w2 = _mm256_set1_pd(work[j * 9 + 2]); 766 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 767 z0 = _mm256_fmadd_pd(a3, w2, z0); 768 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 769 z1 = _mm256_fmadd_pd(a4, w2, z1); 770 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 771 z2 = _mm256_fmadd_pd(a5, w2, z2); 772 773 /* fourth column of a */ 774 w3 = _mm256_set1_pd(work[j * 9 + 3]); 775 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 776 z0 = _mm256_fmadd_pd(a0, w3, z0); 777 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 778 z1 = _mm256_fmadd_pd(a1, w3, z1); 779 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 780 z2 = _mm256_fmadd_pd(a2, w3, z2); 781 782 /* fifth column of a */ 783 w0 = _mm256_set1_pd(work[j * 9 + 4]); 784 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 785 z0 = _mm256_fmadd_pd(a3, w0, z0); 786 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 787 z1 = _mm256_fmadd_pd(a4, w0, z1); 788 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 789 z2 = _mm256_fmadd_pd(a5, w0, z2); 790 791 /* sixth column of a */ 792 w1 = _mm256_set1_pd(work[j * 9 + 5]); 793 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 794 z0 = _mm256_fmadd_pd(a0, w1, z0); 795 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 796 z1 = _mm256_fmadd_pd(a1, w1, z1); 797 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 798 z2 = _mm256_fmadd_pd(a2, w1, z2); 799 800 /* seventh column of a */ 801 w2 = _mm256_set1_pd(work[j * 9 + 6]); 802 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 803 z0 = _mm256_fmadd_pd(a0, w2, z0); 804 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 805 z1 = _mm256_fmadd_pd(a1, w2, z1); 806 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 807 z2 = _mm256_fmadd_pd(a2, w2, z2); 808 809 /* eighth column of a */ 810 w3 = _mm256_set1_pd(work[j * 9 + 7]); 811 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 812 z0 = _mm256_fmadd_pd(a3, w3, z0); 813 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 814 z1 = _mm256_fmadd_pd(a4, w3, z1); 815 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 816 z2 = _mm256_fmadd_pd(a5, w3, z2); 817 818 /* ninth column of a */ 819 w0 = _mm256_set1_pd(work[j * 9 + 8]); 820 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 821 z0 = _mm256_fmadd_pd(a0, w0, z0); 822 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 823 z1 = _mm256_fmadd_pd(a1, w0, z1); 824 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 825 z2 = _mm256_fmadd_pd(a2, w0, z2); 826 } 827 828 _mm256_storeu_pd(&z[0], z0); 829 _mm256_storeu_pd(&z[4], z1); 830 _mm256_maskstore_pd(&z[8], mask1, z2); 831 832 v += n * bs2; 833 if (!usecprow) z += bs; 834 } 835 PetscCall(VecRestoreArrayRead(xx, &x)); 836 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 837 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 838 PetscFunctionReturn(0); 839 } 840 #endif 841 842 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) { 843 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 844 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 845 const PetscScalar *x, *xb; 846 PetscScalar *zarray, xv; 847 const MatScalar *v; 848 const PetscInt *ii, *ij = a->j, *idx; 849 PetscInt mbs, i, j, k, n, *ridx = NULL; 850 PetscBool usecprow = a->compressedrow.use; 851 852 PetscFunctionBegin; 853 PetscCall(VecGetArrayRead(xx, &x)); 854 PetscCall(VecGetArrayWrite(zz, &zarray)); 855 856 v = a->a; 857 if (usecprow) { 858 mbs = a->compressedrow.nrows; 859 ii = a->compressedrow.i; 860 ridx = a->compressedrow.rindex; 861 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 862 } else { 863 mbs = a->mbs; 864 ii = a->i; 865 z = zarray; 866 } 867 868 for (i = 0; i < mbs; i++) { 869 n = ii[i + 1] - ii[i]; 870 idx = ij + ii[i]; 871 sum1 = 0.0; 872 sum2 = 0.0; 873 sum3 = 0.0; 874 sum4 = 0.0; 875 sum5 = 0.0; 876 sum6 = 0.0; 877 sum7 = 0.0; 878 sum8 = 0.0; 879 sum9 = 0.0; 880 sum10 = 0.0; 881 sum11 = 0.0; 882 883 for (j = 0; j < n; j++) { 884 xb = x + 11 * (idx[j]); 885 886 for (k = 0; k < 11; k++) { 887 xv = xb[k]; 888 sum1 += v[0] * xv; 889 sum2 += v[1] * xv; 890 sum3 += v[2] * xv; 891 sum4 += v[3] * xv; 892 sum5 += v[4] * xv; 893 sum6 += v[5] * xv; 894 sum7 += v[6] * xv; 895 sum8 += v[7] * xv; 896 sum9 += v[8] * xv; 897 sum10 += v[9] * xv; 898 sum11 += v[10] * xv; 899 v += 11; 900 } 901 } 902 if (usecprow) z = zarray + 11 * ridx[i]; 903 z[0] = sum1; 904 z[1] = sum2; 905 z[2] = sum3; 906 z[3] = sum4; 907 z[4] = sum5; 908 z[5] = sum6; 909 z[6] = sum7; 910 z[7] = sum8; 911 z[8] = sum9; 912 z[9] = sum10; 913 z[10] = sum11; 914 915 if (!usecprow) z += 11; 916 } 917 918 PetscCall(VecRestoreArrayRead(xx, &x)); 919 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 920 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 921 PetscFunctionReturn(0); 922 } 923 924 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 925 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) { 926 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 927 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 928 const PetscScalar *x, *xb; 929 PetscScalar *zarray, xv; 930 const MatScalar *v; 931 const PetscInt *ii, *ij = a->j, *idx; 932 PetscInt mbs, i, j, k, n, *ridx = NULL; 933 PetscBool usecprow = a->compressedrow.use; 934 935 PetscFunctionBegin; 936 PetscCall(VecGetArrayRead(xx, &x)); 937 PetscCall(VecGetArrayWrite(zz, &zarray)); 938 939 v = a->a; 940 if (usecprow) { 941 mbs = a->compressedrow.nrows; 942 ii = a->compressedrow.i; 943 ridx = a->compressedrow.rindex; 944 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 945 } else { 946 mbs = a->mbs; 947 ii = a->i; 948 z = zarray; 949 } 950 951 for (i = 0; i < mbs; i++) { 952 n = ii[i + 1] - ii[i]; 953 idx = ij + ii[i]; 954 sum1 = 0.0; 955 sum2 = 0.0; 956 sum3 = 0.0; 957 sum4 = 0.0; 958 sum5 = 0.0; 959 sum6 = 0.0; 960 sum7 = 0.0; 961 sum8 = 0.0; 962 sum9 = 0.0; 963 sum10 = 0.0; 964 sum11 = 0.0; 965 sum12 = 0.0; 966 967 for (j = 0; j < n; j++) { 968 xb = x + 12 * (idx[j]); 969 970 for (k = 0; k < 12; k++) { 971 xv = xb[k]; 972 sum1 += v[0] * xv; 973 sum2 += v[1] * xv; 974 sum3 += v[2] * xv; 975 sum4 += v[3] * xv; 976 sum5 += v[4] * xv; 977 sum6 += v[5] * xv; 978 sum7 += v[6] * xv; 979 sum8 += v[7] * xv; 980 sum9 += v[8] * xv; 981 sum10 += v[9] * xv; 982 sum11 += v[10] * xv; 983 sum12 += v[11] * xv; 984 v += 12; 985 } 986 } 987 if (usecprow) z = zarray + 12 * ridx[i]; 988 z[0] = sum1; 989 z[1] = sum2; 990 z[2] = sum3; 991 z[3] = sum4; 992 z[4] = sum5; 993 z[5] = sum6; 994 z[6] = sum7; 995 z[7] = sum8; 996 z[8] = sum9; 997 z[9] = sum10; 998 z[10] = sum11; 999 z[11] = sum12; 1000 if (!usecprow) z += 12; 1001 } 1002 PetscCall(VecRestoreArrayRead(xx, &x)); 1003 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1004 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1005 PetscFunctionReturn(0); 1006 } 1007 1008 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) { 1009 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1010 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1011 const PetscScalar *x, *xb; 1012 PetscScalar *zarray, *yarray, xv; 1013 const MatScalar *v; 1014 const PetscInt *ii, *ij = a->j, *idx; 1015 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 1016 PetscBool usecprow = a->compressedrow.use; 1017 1018 PetscFunctionBegin; 1019 PetscCall(VecGetArrayRead(xx, &x)); 1020 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1021 1022 v = a->a; 1023 if (usecprow) { 1024 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1025 mbs = a->compressedrow.nrows; 1026 ii = a->compressedrow.i; 1027 ridx = a->compressedrow.rindex; 1028 } else { 1029 ii = a->i; 1030 y = yarray; 1031 z = zarray; 1032 } 1033 1034 for (i = 0; i < mbs; i++) { 1035 n = ii[i + 1] - ii[i]; 1036 idx = ij + ii[i]; 1037 1038 if (usecprow) { 1039 y = yarray + 12 * ridx[i]; 1040 z = zarray + 12 * ridx[i]; 1041 } 1042 sum1 = y[0]; 1043 sum2 = y[1]; 1044 sum3 = y[2]; 1045 sum4 = y[3]; 1046 sum5 = y[4]; 1047 sum6 = y[5]; 1048 sum7 = y[6]; 1049 sum8 = y[7]; 1050 sum9 = y[8]; 1051 sum10 = y[9]; 1052 sum11 = y[10]; 1053 sum12 = y[11]; 1054 1055 for (j = 0; j < n; j++) { 1056 xb = x + 12 * (idx[j]); 1057 1058 for (k = 0; k < 12; k++) { 1059 xv = xb[k]; 1060 sum1 += v[0] * xv; 1061 sum2 += v[1] * xv; 1062 sum3 += v[2] * xv; 1063 sum4 += v[3] * xv; 1064 sum5 += v[4] * xv; 1065 sum6 += v[5] * xv; 1066 sum7 += v[6] * xv; 1067 sum8 += v[7] * xv; 1068 sum9 += v[8] * xv; 1069 sum10 += v[9] * xv; 1070 sum11 += v[10] * xv; 1071 sum12 += v[11] * xv; 1072 v += 12; 1073 } 1074 } 1075 1076 z[0] = sum1; 1077 z[1] = sum2; 1078 z[2] = sum3; 1079 z[3] = sum4; 1080 z[4] = sum5; 1081 z[5] = sum6; 1082 z[6] = sum7; 1083 z[7] = sum8; 1084 z[8] = sum9; 1085 z[9] = sum10; 1086 z[10] = sum11; 1087 z[11] = sum12; 1088 if (!usecprow) { 1089 y += 12; 1090 z += 12; 1091 } 1092 } 1093 PetscCall(VecRestoreArrayRead(xx, &x)); 1094 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1095 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1096 PetscFunctionReturn(0); 1097 } 1098 1099 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1100 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) { 1101 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1102 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1103 const PetscScalar *x, *xb; 1104 PetscScalar x1, x2, x3, x4, *zarray; 1105 const MatScalar *v; 1106 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1107 PetscInt mbs, i, j, n; 1108 PetscBool usecprow = a->compressedrow.use; 1109 1110 PetscFunctionBegin; 1111 PetscCall(VecGetArrayRead(xx, &x)); 1112 PetscCall(VecGetArrayWrite(zz, &zarray)); 1113 1114 v = a->a; 1115 if (usecprow) { 1116 mbs = a->compressedrow.nrows; 1117 ii = a->compressedrow.i; 1118 ridx = a->compressedrow.rindex; 1119 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1120 } else { 1121 mbs = a->mbs; 1122 ii = a->i; 1123 z = zarray; 1124 } 1125 1126 for (i = 0; i < mbs; i++) { 1127 n = ii[i + 1] - ii[i]; 1128 idx = ij + ii[i]; 1129 1130 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1131 for (j = 0; j < n; j++) { 1132 xb = x + 12 * (idx[j]); 1133 x1 = xb[0]; 1134 x2 = xb[1]; 1135 x3 = xb[2]; 1136 x4 = xb[3]; 1137 1138 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1139 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1140 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1141 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1142 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1143 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1144 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1145 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1146 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1147 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1148 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1149 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1150 v += 48; 1151 1152 x1 = xb[4]; 1153 x2 = xb[5]; 1154 x3 = xb[6]; 1155 x4 = xb[7]; 1156 1157 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1158 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1159 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1160 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1161 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1162 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1163 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1164 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1165 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1166 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1167 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1168 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1169 v += 48; 1170 1171 x1 = xb[8]; 1172 x2 = xb[9]; 1173 x3 = xb[10]; 1174 x4 = xb[11]; 1175 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1176 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1177 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1178 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1179 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1180 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1181 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1182 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1183 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1184 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1185 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1186 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1187 v += 48; 1188 } 1189 if (usecprow) z = zarray + 12 * ridx[i]; 1190 z[0] = sum1; 1191 z[1] = sum2; 1192 z[2] = sum3; 1193 z[3] = sum4; 1194 z[4] = sum5; 1195 z[5] = sum6; 1196 z[6] = sum7; 1197 z[7] = sum8; 1198 z[8] = sum9; 1199 z[9] = sum10; 1200 z[10] = sum11; 1201 z[11] = sum12; 1202 if (!usecprow) z += 12; 1203 } 1204 PetscCall(VecRestoreArrayRead(xx, &x)); 1205 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1206 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1207 PetscFunctionReturn(0); 1208 } 1209 1210 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1211 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) { 1212 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1213 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1214 const PetscScalar *x, *xb; 1215 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1216 const MatScalar *v; 1217 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1218 PetscInt mbs = a->mbs, i, j, n; 1219 PetscBool usecprow = a->compressedrow.use; 1220 1221 PetscFunctionBegin; 1222 PetscCall(VecGetArrayRead(xx, &x)); 1223 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1224 1225 v = a->a; 1226 if (usecprow) { 1227 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1228 mbs = a->compressedrow.nrows; 1229 ii = a->compressedrow.i; 1230 ridx = a->compressedrow.rindex; 1231 } else { 1232 ii = a->i; 1233 y = yarray; 1234 z = zarray; 1235 } 1236 1237 for (i = 0; i < mbs; i++) { 1238 n = ii[i + 1] - ii[i]; 1239 idx = ij + ii[i]; 1240 1241 if (usecprow) { 1242 y = yarray + 12 * ridx[i]; 1243 z = zarray + 12 * ridx[i]; 1244 } 1245 sum1 = y[0]; 1246 sum2 = y[1]; 1247 sum3 = y[2]; 1248 sum4 = y[3]; 1249 sum5 = y[4]; 1250 sum6 = y[5]; 1251 sum7 = y[6]; 1252 sum8 = y[7]; 1253 sum9 = y[8]; 1254 sum10 = y[9]; 1255 sum11 = y[10]; 1256 sum12 = y[11]; 1257 1258 for (j = 0; j < n; j++) { 1259 xb = x + 12 * (idx[j]); 1260 x1 = xb[0]; 1261 x2 = xb[1]; 1262 x3 = xb[2]; 1263 x4 = xb[3]; 1264 1265 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1266 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1267 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1268 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1269 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1270 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1271 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1272 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1273 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1274 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1275 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1276 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1277 v += 48; 1278 1279 x1 = xb[4]; 1280 x2 = xb[5]; 1281 x3 = xb[6]; 1282 x4 = xb[7]; 1283 1284 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1285 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1286 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1287 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1288 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1289 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1290 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1291 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1292 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1293 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1294 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1295 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1296 v += 48; 1297 1298 x1 = xb[8]; 1299 x2 = xb[9]; 1300 x3 = xb[10]; 1301 x4 = xb[11]; 1302 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1303 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1304 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1305 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1306 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1307 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1308 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1309 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1310 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1311 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1312 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1313 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1314 v += 48; 1315 } 1316 z[0] = sum1; 1317 z[1] = sum2; 1318 z[2] = sum3; 1319 z[3] = sum4; 1320 z[4] = sum5; 1321 z[5] = sum6; 1322 z[6] = sum7; 1323 z[7] = sum8; 1324 z[8] = sum9; 1325 z[9] = sum10; 1326 z[10] = sum11; 1327 z[11] = sum12; 1328 if (!usecprow) { 1329 y += 12; 1330 z += 12; 1331 } 1332 } 1333 PetscCall(VecRestoreArrayRead(xx, &x)); 1334 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1335 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1336 PetscFunctionReturn(0); 1337 } 1338 1339 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1340 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) { 1341 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1342 PetscScalar *z = NULL, *zarray; 1343 const PetscScalar *x, *work; 1344 const MatScalar *v = a->a; 1345 PetscInt mbs, i, j, n; 1346 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1347 PetscBool usecprow = a->compressedrow.use; 1348 const PetscInt bs = 12, bs2 = 144; 1349 1350 __m256d a0, a1, a2, a3, a4, a5; 1351 __m256d w0, w1, w2, w3; 1352 __m256d z0, z1, z2; 1353 1354 PetscFunctionBegin; 1355 PetscCall(VecGetArrayRead(xx, &x)); 1356 PetscCall(VecGetArrayWrite(zz, &zarray)); 1357 1358 if (usecprow) { 1359 mbs = a->compressedrow.nrows; 1360 ii = a->compressedrow.i; 1361 ridx = a->compressedrow.rindex; 1362 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1363 } else { 1364 mbs = a->mbs; 1365 ii = a->i; 1366 z = zarray; 1367 } 1368 1369 for (i = 0; i < mbs; i++) { 1370 z0 = _mm256_setzero_pd(); 1371 z1 = _mm256_setzero_pd(); 1372 z2 = _mm256_setzero_pd(); 1373 1374 n = ii[1] - ii[0]; 1375 ii++; 1376 for (j = 0; j < n; j++) { 1377 work = x + bs * (*idx++); 1378 1379 /* first column of a */ 1380 w0 = _mm256_set1_pd(work[0]); 1381 a0 = _mm256_loadu_pd(v + 0); 1382 z0 = _mm256_fmadd_pd(a0, w0, z0); 1383 a1 = _mm256_loadu_pd(v + 4); 1384 z1 = _mm256_fmadd_pd(a1, w0, z1); 1385 a2 = _mm256_loadu_pd(v + 8); 1386 z2 = _mm256_fmadd_pd(a2, w0, z2); 1387 1388 /* second column of a */ 1389 w1 = _mm256_set1_pd(work[1]); 1390 a3 = _mm256_loadu_pd(v + 12); 1391 z0 = _mm256_fmadd_pd(a3, w1, z0); 1392 a4 = _mm256_loadu_pd(v + 16); 1393 z1 = _mm256_fmadd_pd(a4, w1, z1); 1394 a5 = _mm256_loadu_pd(v + 20); 1395 z2 = _mm256_fmadd_pd(a5, w1, z2); 1396 1397 /* third column of a */ 1398 w2 = _mm256_set1_pd(work[2]); 1399 a0 = _mm256_loadu_pd(v + 24); 1400 z0 = _mm256_fmadd_pd(a0, w2, z0); 1401 a1 = _mm256_loadu_pd(v + 28); 1402 z1 = _mm256_fmadd_pd(a1, w2, z1); 1403 a2 = _mm256_loadu_pd(v + 32); 1404 z2 = _mm256_fmadd_pd(a2, w2, z2); 1405 1406 /* fourth column of a */ 1407 w3 = _mm256_set1_pd(work[3]); 1408 a3 = _mm256_loadu_pd(v + 36); 1409 z0 = _mm256_fmadd_pd(a3, w3, z0); 1410 a4 = _mm256_loadu_pd(v + 40); 1411 z1 = _mm256_fmadd_pd(a4, w3, z1); 1412 a5 = _mm256_loadu_pd(v + 44); 1413 z2 = _mm256_fmadd_pd(a5, w3, z2); 1414 1415 /* fifth column of a */ 1416 w0 = _mm256_set1_pd(work[4]); 1417 a0 = _mm256_loadu_pd(v + 48); 1418 z0 = _mm256_fmadd_pd(a0, w0, z0); 1419 a1 = _mm256_loadu_pd(v + 52); 1420 z1 = _mm256_fmadd_pd(a1, w0, z1); 1421 a2 = _mm256_loadu_pd(v + 56); 1422 z2 = _mm256_fmadd_pd(a2, w0, z2); 1423 1424 /* sixth column of a */ 1425 w1 = _mm256_set1_pd(work[5]); 1426 a3 = _mm256_loadu_pd(v + 60); 1427 z0 = _mm256_fmadd_pd(a3, w1, z0); 1428 a4 = _mm256_loadu_pd(v + 64); 1429 z1 = _mm256_fmadd_pd(a4, w1, z1); 1430 a5 = _mm256_loadu_pd(v + 68); 1431 z2 = _mm256_fmadd_pd(a5, w1, z2); 1432 1433 /* seventh column of a */ 1434 w2 = _mm256_set1_pd(work[6]); 1435 a0 = _mm256_loadu_pd(v + 72); 1436 z0 = _mm256_fmadd_pd(a0, w2, z0); 1437 a1 = _mm256_loadu_pd(v + 76); 1438 z1 = _mm256_fmadd_pd(a1, w2, z1); 1439 a2 = _mm256_loadu_pd(v + 80); 1440 z2 = _mm256_fmadd_pd(a2, w2, z2); 1441 1442 /* eighth column of a */ 1443 w3 = _mm256_set1_pd(work[7]); 1444 a3 = _mm256_loadu_pd(v + 84); 1445 z0 = _mm256_fmadd_pd(a3, w3, z0); 1446 a4 = _mm256_loadu_pd(v + 88); 1447 z1 = _mm256_fmadd_pd(a4, w3, z1); 1448 a5 = _mm256_loadu_pd(v + 92); 1449 z2 = _mm256_fmadd_pd(a5, w3, z2); 1450 1451 /* ninth column of a */ 1452 w0 = _mm256_set1_pd(work[8]); 1453 a0 = _mm256_loadu_pd(v + 96); 1454 z0 = _mm256_fmadd_pd(a0, w0, z0); 1455 a1 = _mm256_loadu_pd(v + 100); 1456 z1 = _mm256_fmadd_pd(a1, w0, z1); 1457 a2 = _mm256_loadu_pd(v + 104); 1458 z2 = _mm256_fmadd_pd(a2, w0, z2); 1459 1460 /* tenth column of a */ 1461 w1 = _mm256_set1_pd(work[9]); 1462 a3 = _mm256_loadu_pd(v + 108); 1463 z0 = _mm256_fmadd_pd(a3, w1, z0); 1464 a4 = _mm256_loadu_pd(v + 112); 1465 z1 = _mm256_fmadd_pd(a4, w1, z1); 1466 a5 = _mm256_loadu_pd(v + 116); 1467 z2 = _mm256_fmadd_pd(a5, w1, z2); 1468 1469 /* eleventh column of a */ 1470 w2 = _mm256_set1_pd(work[10]); 1471 a0 = _mm256_loadu_pd(v + 120); 1472 z0 = _mm256_fmadd_pd(a0, w2, z0); 1473 a1 = _mm256_loadu_pd(v + 124); 1474 z1 = _mm256_fmadd_pd(a1, w2, z1); 1475 a2 = _mm256_loadu_pd(v + 128); 1476 z2 = _mm256_fmadd_pd(a2, w2, z2); 1477 1478 /* twelveth column of a */ 1479 w3 = _mm256_set1_pd(work[11]); 1480 a3 = _mm256_loadu_pd(v + 132); 1481 z0 = _mm256_fmadd_pd(a3, w3, z0); 1482 a4 = _mm256_loadu_pd(v + 136); 1483 z1 = _mm256_fmadd_pd(a4, w3, z1); 1484 a5 = _mm256_loadu_pd(v + 140); 1485 z2 = _mm256_fmadd_pd(a5, w3, z2); 1486 1487 v += bs2; 1488 } 1489 if (usecprow) z = zarray + bs * ridx[i]; 1490 _mm256_storeu_pd(&z[0], z0); 1491 _mm256_storeu_pd(&z[4], z1); 1492 _mm256_storeu_pd(&z[8], z2); 1493 if (!usecprow) z += bs; 1494 } 1495 PetscCall(VecRestoreArrayRead(xx, &x)); 1496 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1497 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1498 PetscFunctionReturn(0); 1499 } 1500 #endif 1501 1502 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1503 /* Default MatMult for block size 15 */ 1504 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) { 1505 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1506 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1507 const PetscScalar *x, *xb; 1508 PetscScalar *zarray, xv; 1509 const MatScalar *v; 1510 const PetscInt *ii, *ij = a->j, *idx; 1511 PetscInt mbs, i, j, k, n, *ridx = NULL; 1512 PetscBool usecprow = a->compressedrow.use; 1513 1514 PetscFunctionBegin; 1515 PetscCall(VecGetArrayRead(xx, &x)); 1516 PetscCall(VecGetArrayWrite(zz, &zarray)); 1517 1518 v = a->a; 1519 if (usecprow) { 1520 mbs = a->compressedrow.nrows; 1521 ii = a->compressedrow.i; 1522 ridx = a->compressedrow.rindex; 1523 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1524 } else { 1525 mbs = a->mbs; 1526 ii = a->i; 1527 z = zarray; 1528 } 1529 1530 for (i = 0; i < mbs; i++) { 1531 n = ii[i + 1] - ii[i]; 1532 idx = ij + ii[i]; 1533 sum1 = 0.0; 1534 sum2 = 0.0; 1535 sum3 = 0.0; 1536 sum4 = 0.0; 1537 sum5 = 0.0; 1538 sum6 = 0.0; 1539 sum7 = 0.0; 1540 sum8 = 0.0; 1541 sum9 = 0.0; 1542 sum10 = 0.0; 1543 sum11 = 0.0; 1544 sum12 = 0.0; 1545 sum13 = 0.0; 1546 sum14 = 0.0; 1547 sum15 = 0.0; 1548 1549 for (j = 0; j < n; j++) { 1550 xb = x + 15 * (idx[j]); 1551 1552 for (k = 0; k < 15; k++) { 1553 xv = xb[k]; 1554 sum1 += v[0] * xv; 1555 sum2 += v[1] * xv; 1556 sum3 += v[2] * xv; 1557 sum4 += v[3] * xv; 1558 sum5 += v[4] * xv; 1559 sum6 += v[5] * xv; 1560 sum7 += v[6] * xv; 1561 sum8 += v[7] * xv; 1562 sum9 += v[8] * xv; 1563 sum10 += v[9] * xv; 1564 sum11 += v[10] * xv; 1565 sum12 += v[11] * xv; 1566 sum13 += v[12] * xv; 1567 sum14 += v[13] * xv; 1568 sum15 += v[14] * xv; 1569 v += 15; 1570 } 1571 } 1572 if (usecprow) z = zarray + 15 * ridx[i]; 1573 z[0] = sum1; 1574 z[1] = sum2; 1575 z[2] = sum3; 1576 z[3] = sum4; 1577 z[4] = sum5; 1578 z[5] = sum6; 1579 z[6] = sum7; 1580 z[7] = sum8; 1581 z[8] = sum9; 1582 z[9] = sum10; 1583 z[10] = sum11; 1584 z[11] = sum12; 1585 z[12] = sum13; 1586 z[13] = sum14; 1587 z[14] = sum15; 1588 1589 if (!usecprow) z += 15; 1590 } 1591 1592 PetscCall(VecRestoreArrayRead(xx, &x)); 1593 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1594 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1595 PetscFunctionReturn(0); 1596 } 1597 1598 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1599 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) { 1600 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1601 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1602 const PetscScalar *x, *xb; 1603 PetscScalar x1, x2, x3, x4, *zarray; 1604 const MatScalar *v; 1605 const PetscInt *ii, *ij = a->j, *idx; 1606 PetscInt mbs, i, j, n, *ridx = NULL; 1607 PetscBool usecprow = a->compressedrow.use; 1608 1609 PetscFunctionBegin; 1610 PetscCall(VecGetArrayRead(xx, &x)); 1611 PetscCall(VecGetArrayWrite(zz, &zarray)); 1612 1613 v = a->a; 1614 if (usecprow) { 1615 mbs = a->compressedrow.nrows; 1616 ii = a->compressedrow.i; 1617 ridx = a->compressedrow.rindex; 1618 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1619 } else { 1620 mbs = a->mbs; 1621 ii = a->i; 1622 z = zarray; 1623 } 1624 1625 for (i = 0; i < mbs; i++) { 1626 n = ii[i + 1] - ii[i]; 1627 idx = ij + ii[i]; 1628 sum1 = 0.0; 1629 sum2 = 0.0; 1630 sum3 = 0.0; 1631 sum4 = 0.0; 1632 sum5 = 0.0; 1633 sum6 = 0.0; 1634 sum7 = 0.0; 1635 sum8 = 0.0; 1636 sum9 = 0.0; 1637 sum10 = 0.0; 1638 sum11 = 0.0; 1639 sum12 = 0.0; 1640 sum13 = 0.0; 1641 sum14 = 0.0; 1642 sum15 = 0.0; 1643 1644 for (j = 0; j < n; j++) { 1645 xb = x + 15 * (idx[j]); 1646 x1 = xb[0]; 1647 x2 = xb[1]; 1648 x3 = xb[2]; 1649 x4 = xb[3]; 1650 1651 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1652 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1653 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1654 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1655 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1656 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1657 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1658 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1659 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1660 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1661 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1662 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1663 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1664 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1665 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1666 1667 v += 60; 1668 1669 x1 = xb[4]; 1670 x2 = xb[5]; 1671 x3 = xb[6]; 1672 x4 = xb[7]; 1673 1674 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1675 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1676 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1677 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1678 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1679 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1680 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1681 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1682 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1683 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1684 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1685 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1686 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1687 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1688 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1689 v += 60; 1690 1691 x1 = xb[8]; 1692 x2 = xb[9]; 1693 x3 = xb[10]; 1694 x4 = xb[11]; 1695 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1696 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1697 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1698 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1699 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1700 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1701 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1702 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1703 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1704 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1705 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1706 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1707 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1708 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1709 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1710 v += 60; 1711 1712 x1 = xb[12]; 1713 x2 = xb[13]; 1714 x3 = xb[14]; 1715 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1716 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1717 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1718 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1719 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1720 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1721 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1722 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1723 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1724 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1725 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1726 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1727 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1728 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1729 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1730 v += 45; 1731 } 1732 if (usecprow) z = zarray + 15 * ridx[i]; 1733 z[0] = sum1; 1734 z[1] = sum2; 1735 z[2] = sum3; 1736 z[3] = sum4; 1737 z[4] = sum5; 1738 z[5] = sum6; 1739 z[6] = sum7; 1740 z[7] = sum8; 1741 z[8] = sum9; 1742 z[9] = sum10; 1743 z[10] = sum11; 1744 z[11] = sum12; 1745 z[12] = sum13; 1746 z[13] = sum14; 1747 z[14] = sum15; 1748 1749 if (!usecprow) z += 15; 1750 } 1751 1752 PetscCall(VecRestoreArrayRead(xx, &x)); 1753 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1754 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1759 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) { 1760 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1761 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1762 const PetscScalar *x, *xb; 1763 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1764 const MatScalar *v; 1765 const PetscInt *ii, *ij = a->j, *idx; 1766 PetscInt mbs, i, j, n, *ridx = NULL; 1767 PetscBool usecprow = a->compressedrow.use; 1768 1769 PetscFunctionBegin; 1770 PetscCall(VecGetArrayRead(xx, &x)); 1771 PetscCall(VecGetArrayWrite(zz, &zarray)); 1772 1773 v = a->a; 1774 if (usecprow) { 1775 mbs = a->compressedrow.nrows; 1776 ii = a->compressedrow.i; 1777 ridx = a->compressedrow.rindex; 1778 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1779 } else { 1780 mbs = a->mbs; 1781 ii = a->i; 1782 z = zarray; 1783 } 1784 1785 for (i = 0; i < mbs; i++) { 1786 n = ii[i + 1] - ii[i]; 1787 idx = ij + ii[i]; 1788 sum1 = 0.0; 1789 sum2 = 0.0; 1790 sum3 = 0.0; 1791 sum4 = 0.0; 1792 sum5 = 0.0; 1793 sum6 = 0.0; 1794 sum7 = 0.0; 1795 sum8 = 0.0; 1796 sum9 = 0.0; 1797 sum10 = 0.0; 1798 sum11 = 0.0; 1799 sum12 = 0.0; 1800 sum13 = 0.0; 1801 sum14 = 0.0; 1802 sum15 = 0.0; 1803 1804 for (j = 0; j < n; j++) { 1805 xb = x + 15 * (idx[j]); 1806 x1 = xb[0]; 1807 x2 = xb[1]; 1808 x3 = xb[2]; 1809 x4 = xb[3]; 1810 x5 = xb[4]; 1811 x6 = xb[5]; 1812 x7 = xb[6]; 1813 x8 = xb[7]; 1814 1815 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1816 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1817 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1818 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1819 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1820 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1821 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1822 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1823 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1824 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1825 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1826 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1827 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1828 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1829 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1830 v += 120; 1831 1832 x1 = xb[8]; 1833 x2 = xb[9]; 1834 x3 = xb[10]; 1835 x4 = xb[11]; 1836 x5 = xb[12]; 1837 x6 = xb[13]; 1838 x7 = xb[14]; 1839 1840 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1841 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1842 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1843 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1844 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1845 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1846 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1847 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1848 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1849 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1850 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1851 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1852 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1853 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1854 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1855 v += 105; 1856 } 1857 if (usecprow) z = zarray + 15 * ridx[i]; 1858 z[0] = sum1; 1859 z[1] = sum2; 1860 z[2] = sum3; 1861 z[3] = sum4; 1862 z[4] = sum5; 1863 z[5] = sum6; 1864 z[6] = sum7; 1865 z[7] = sum8; 1866 z[8] = sum9; 1867 z[9] = sum10; 1868 z[10] = sum11; 1869 z[11] = sum12; 1870 z[12] = sum13; 1871 z[13] = sum14; 1872 z[14] = sum15; 1873 1874 if (!usecprow) z += 15; 1875 } 1876 1877 PetscCall(VecRestoreArrayRead(xx, &x)); 1878 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1879 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1880 PetscFunctionReturn(0); 1881 } 1882 1883 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1884 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) { 1885 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1886 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1887 const PetscScalar *x, *xb; 1888 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1889 const MatScalar *v; 1890 const PetscInt *ii, *ij = a->j, *idx; 1891 PetscInt mbs, i, j, n, *ridx = NULL; 1892 PetscBool usecprow = a->compressedrow.use; 1893 1894 PetscFunctionBegin; 1895 PetscCall(VecGetArrayRead(xx, &x)); 1896 PetscCall(VecGetArrayWrite(zz, &zarray)); 1897 1898 v = a->a; 1899 if (usecprow) { 1900 mbs = a->compressedrow.nrows; 1901 ii = a->compressedrow.i; 1902 ridx = a->compressedrow.rindex; 1903 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1904 } else { 1905 mbs = a->mbs; 1906 ii = a->i; 1907 z = zarray; 1908 } 1909 1910 for (i = 0; i < mbs; i++) { 1911 n = ii[i + 1] - ii[i]; 1912 idx = ij + ii[i]; 1913 sum1 = 0.0; 1914 sum2 = 0.0; 1915 sum3 = 0.0; 1916 sum4 = 0.0; 1917 sum5 = 0.0; 1918 sum6 = 0.0; 1919 sum7 = 0.0; 1920 sum8 = 0.0; 1921 sum9 = 0.0; 1922 sum10 = 0.0; 1923 sum11 = 0.0; 1924 sum12 = 0.0; 1925 sum13 = 0.0; 1926 sum14 = 0.0; 1927 sum15 = 0.0; 1928 1929 for (j = 0; j < n; j++) { 1930 xb = x + 15 * (idx[j]); 1931 x1 = xb[0]; 1932 x2 = xb[1]; 1933 x3 = xb[2]; 1934 x4 = xb[3]; 1935 x5 = xb[4]; 1936 x6 = xb[5]; 1937 x7 = xb[6]; 1938 x8 = xb[7]; 1939 x9 = xb[8]; 1940 x10 = xb[9]; 1941 x11 = xb[10]; 1942 x12 = xb[11]; 1943 x13 = xb[12]; 1944 x14 = xb[13]; 1945 x15 = xb[14]; 1946 1947 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1948 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1949 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1950 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1951 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1952 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1953 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1954 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1955 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1956 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1957 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1958 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1959 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1960 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1961 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1962 v += 225; 1963 } 1964 if (usecprow) z = zarray + 15 * ridx[i]; 1965 z[0] = sum1; 1966 z[1] = sum2; 1967 z[2] = sum3; 1968 z[3] = sum4; 1969 z[4] = sum5; 1970 z[5] = sum6; 1971 z[6] = sum7; 1972 z[7] = sum8; 1973 z[8] = sum9; 1974 z[9] = sum10; 1975 z[10] = sum11; 1976 z[11] = sum12; 1977 z[12] = sum13; 1978 z[13] = sum14; 1979 z[14] = sum15; 1980 1981 if (!usecprow) z += 15; 1982 } 1983 1984 PetscCall(VecRestoreArrayRead(xx, &x)); 1985 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1986 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1987 PetscFunctionReturn(0); 1988 } 1989 1990 /* 1991 This will not work with MatScalar == float because it calls the BLAS 1992 */ 1993 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) { 1994 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1995 PetscScalar *z = NULL, *work, *workt, *zarray; 1996 const PetscScalar *x, *xb; 1997 const MatScalar *v; 1998 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1999 const PetscInt *idx, *ii, *ridx = NULL; 2000 PetscInt ncols, k; 2001 PetscBool usecprow = a->compressedrow.use; 2002 2003 PetscFunctionBegin; 2004 PetscCall(VecGetArrayRead(xx, &x)); 2005 PetscCall(VecGetArrayWrite(zz, &zarray)); 2006 2007 idx = a->j; 2008 v = a->a; 2009 if (usecprow) { 2010 mbs = a->compressedrow.nrows; 2011 ii = a->compressedrow.i; 2012 ridx = a->compressedrow.rindex; 2013 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2014 } else { 2015 mbs = a->mbs; 2016 ii = a->i; 2017 z = zarray; 2018 } 2019 2020 if (!a->mult_work) { 2021 k = PetscMax(A->rmap->n, A->cmap->n); 2022 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2023 } 2024 work = a->mult_work; 2025 for (i = 0; i < mbs; i++) { 2026 n = ii[1] - ii[0]; 2027 ii++; 2028 ncols = n * bs; 2029 workt = work; 2030 for (j = 0; j < n; j++) { 2031 xb = x + bs * (*idx++); 2032 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2033 workt += bs; 2034 } 2035 if (usecprow) z = zarray + bs * ridx[i]; 2036 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2037 v += n * bs2; 2038 if (!usecprow) z += bs; 2039 } 2040 PetscCall(VecRestoreArrayRead(xx, &x)); 2041 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2042 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2043 PetscFunctionReturn(0); 2044 } 2045 2046 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) { 2047 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2048 const PetscScalar *x; 2049 PetscScalar *y, *z, sum; 2050 const MatScalar *v; 2051 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2052 const PetscInt *idx, *ii; 2053 PetscBool usecprow = a->compressedrow.use; 2054 2055 PetscFunctionBegin; 2056 PetscCall(VecGetArrayRead(xx, &x)); 2057 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2058 2059 idx = a->j; 2060 v = a->a; 2061 if (usecprow) { 2062 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 2063 mbs = a->compressedrow.nrows; 2064 ii = a->compressedrow.i; 2065 ridx = a->compressedrow.rindex; 2066 } else { 2067 ii = a->i; 2068 } 2069 2070 for (i = 0; i < mbs; i++) { 2071 n = ii[1] - ii[0]; 2072 ii++; 2073 if (!usecprow) { 2074 sum = y[i]; 2075 } else { 2076 sum = y[ridx[i]]; 2077 } 2078 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2079 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2080 PetscSparseDensePlusDot(sum, x, v, idx, n); 2081 v += n; 2082 idx += n; 2083 if (usecprow) { 2084 z[ridx[i]] = sum; 2085 } else { 2086 z[i] = sum; 2087 } 2088 } 2089 PetscCall(VecRestoreArrayRead(xx, &x)); 2090 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2091 PetscCall(PetscLogFlops(2.0 * a->nz)); 2092 PetscFunctionReturn(0); 2093 } 2094 2095 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) { 2096 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2097 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2098 const PetscScalar *x, *xb; 2099 PetscScalar x1, x2, *yarray, *zarray; 2100 const MatScalar *v; 2101 PetscInt mbs = a->mbs, i, n, j; 2102 const PetscInt *idx, *ii, *ridx = NULL; 2103 PetscBool usecprow = a->compressedrow.use; 2104 2105 PetscFunctionBegin; 2106 PetscCall(VecGetArrayRead(xx, &x)); 2107 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2108 2109 idx = a->j; 2110 v = a->a; 2111 if (usecprow) { 2112 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 2113 mbs = a->compressedrow.nrows; 2114 ii = a->compressedrow.i; 2115 ridx = a->compressedrow.rindex; 2116 } else { 2117 ii = a->i; 2118 y = yarray; 2119 z = zarray; 2120 } 2121 2122 for (i = 0; i < mbs; i++) { 2123 n = ii[1] - ii[0]; 2124 ii++; 2125 if (usecprow) { 2126 z = zarray + 2 * ridx[i]; 2127 y = yarray + 2 * ridx[i]; 2128 } 2129 sum1 = y[0]; 2130 sum2 = y[1]; 2131 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2132 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2133 for (j = 0; j < n; j++) { 2134 xb = x + 2 * (*idx++); 2135 x1 = xb[0]; 2136 x2 = xb[1]; 2137 2138 sum1 += v[0] * x1 + v[2] * x2; 2139 sum2 += v[1] * x1 + v[3] * x2; 2140 v += 4; 2141 } 2142 z[0] = sum1; 2143 z[1] = sum2; 2144 if (!usecprow) { 2145 z += 2; 2146 y += 2; 2147 } 2148 } 2149 PetscCall(VecRestoreArrayRead(xx, &x)); 2150 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2151 PetscCall(PetscLogFlops(4.0 * a->nz)); 2152 PetscFunctionReturn(0); 2153 } 2154 2155 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) { 2156 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2157 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2158 const PetscScalar *x, *xb; 2159 const MatScalar *v; 2160 PetscInt mbs = a->mbs, i, j, n; 2161 const PetscInt *idx, *ii, *ridx = NULL; 2162 PetscBool usecprow = a->compressedrow.use; 2163 2164 PetscFunctionBegin; 2165 PetscCall(VecGetArrayRead(xx, &x)); 2166 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2167 2168 idx = a->j; 2169 v = a->a; 2170 if (usecprow) { 2171 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 2172 mbs = a->compressedrow.nrows; 2173 ii = a->compressedrow.i; 2174 ridx = a->compressedrow.rindex; 2175 } else { 2176 ii = a->i; 2177 y = yarray; 2178 z = zarray; 2179 } 2180 2181 for (i = 0; i < mbs; i++) { 2182 n = ii[1] - ii[0]; 2183 ii++; 2184 if (usecprow) { 2185 z = zarray + 3 * ridx[i]; 2186 y = yarray + 3 * ridx[i]; 2187 } 2188 sum1 = y[0]; 2189 sum2 = y[1]; 2190 sum3 = y[2]; 2191 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2192 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2193 for (j = 0; j < n; j++) { 2194 xb = x + 3 * (*idx++); 2195 x1 = xb[0]; 2196 x2 = xb[1]; 2197 x3 = xb[2]; 2198 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2199 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2200 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2201 v += 9; 2202 } 2203 z[0] = sum1; 2204 z[1] = sum2; 2205 z[2] = sum3; 2206 if (!usecprow) { 2207 z += 3; 2208 y += 3; 2209 } 2210 } 2211 PetscCall(VecRestoreArrayRead(xx, &x)); 2212 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2213 PetscCall(PetscLogFlops(18.0 * a->nz)); 2214 PetscFunctionReturn(0); 2215 } 2216 2217 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) { 2218 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2219 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2220 const PetscScalar *x, *xb; 2221 const MatScalar *v; 2222 PetscInt mbs = a->mbs, i, j, n; 2223 const PetscInt *idx, *ii, *ridx = NULL; 2224 PetscBool usecprow = a->compressedrow.use; 2225 2226 PetscFunctionBegin; 2227 PetscCall(VecGetArrayRead(xx, &x)); 2228 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2229 2230 idx = a->j; 2231 v = a->a; 2232 if (usecprow) { 2233 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 2234 mbs = a->compressedrow.nrows; 2235 ii = a->compressedrow.i; 2236 ridx = a->compressedrow.rindex; 2237 } else { 2238 ii = a->i; 2239 y = yarray; 2240 z = zarray; 2241 } 2242 2243 for (i = 0; i < mbs; i++) { 2244 n = ii[1] - ii[0]; 2245 ii++; 2246 if (usecprow) { 2247 z = zarray + 4 * ridx[i]; 2248 y = yarray + 4 * ridx[i]; 2249 } 2250 sum1 = y[0]; 2251 sum2 = y[1]; 2252 sum3 = y[2]; 2253 sum4 = y[3]; 2254 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2255 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2256 for (j = 0; j < n; j++) { 2257 xb = x + 4 * (*idx++); 2258 x1 = xb[0]; 2259 x2 = xb[1]; 2260 x3 = xb[2]; 2261 x4 = xb[3]; 2262 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2263 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2264 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2265 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2266 v += 16; 2267 } 2268 z[0] = sum1; 2269 z[1] = sum2; 2270 z[2] = sum3; 2271 z[3] = sum4; 2272 if (!usecprow) { 2273 z += 4; 2274 y += 4; 2275 } 2276 } 2277 PetscCall(VecRestoreArrayRead(xx, &x)); 2278 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2279 PetscCall(PetscLogFlops(32.0 * a->nz)); 2280 PetscFunctionReturn(0); 2281 } 2282 2283 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) { 2284 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2285 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2286 const PetscScalar *x, *xb; 2287 PetscScalar *yarray, *zarray; 2288 const MatScalar *v; 2289 PetscInt mbs = a->mbs, i, j, n; 2290 const PetscInt *idx, *ii, *ridx = NULL; 2291 PetscBool usecprow = a->compressedrow.use; 2292 2293 PetscFunctionBegin; 2294 PetscCall(VecGetArrayRead(xx, &x)); 2295 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2296 2297 idx = a->j; 2298 v = a->a; 2299 if (usecprow) { 2300 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 2301 mbs = a->compressedrow.nrows; 2302 ii = a->compressedrow.i; 2303 ridx = a->compressedrow.rindex; 2304 } else { 2305 ii = a->i; 2306 y = yarray; 2307 z = zarray; 2308 } 2309 2310 for (i = 0; i < mbs; i++) { 2311 n = ii[1] - ii[0]; 2312 ii++; 2313 if (usecprow) { 2314 z = zarray + 5 * ridx[i]; 2315 y = yarray + 5 * ridx[i]; 2316 } 2317 sum1 = y[0]; 2318 sum2 = y[1]; 2319 sum3 = y[2]; 2320 sum4 = y[3]; 2321 sum5 = y[4]; 2322 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2323 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2324 for (j = 0; j < n; j++) { 2325 xb = x + 5 * (*idx++); 2326 x1 = xb[0]; 2327 x2 = xb[1]; 2328 x3 = xb[2]; 2329 x4 = xb[3]; 2330 x5 = xb[4]; 2331 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2332 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2333 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2334 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2335 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2336 v += 25; 2337 } 2338 z[0] = sum1; 2339 z[1] = sum2; 2340 z[2] = sum3; 2341 z[3] = sum4; 2342 z[4] = sum5; 2343 if (!usecprow) { 2344 z += 5; 2345 y += 5; 2346 } 2347 } 2348 PetscCall(VecRestoreArrayRead(xx, &x)); 2349 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2350 PetscCall(PetscLogFlops(50.0 * a->nz)); 2351 PetscFunctionReturn(0); 2352 } 2353 2354 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) { 2355 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2356 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2357 const PetscScalar *x, *xb; 2358 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2359 const MatScalar *v; 2360 PetscInt mbs = a->mbs, i, j, n; 2361 const PetscInt *idx, *ii, *ridx = NULL; 2362 PetscBool usecprow = a->compressedrow.use; 2363 2364 PetscFunctionBegin; 2365 PetscCall(VecGetArrayRead(xx, &x)); 2366 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2367 2368 idx = a->j; 2369 v = a->a; 2370 if (usecprow) { 2371 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 2372 mbs = a->compressedrow.nrows; 2373 ii = a->compressedrow.i; 2374 ridx = a->compressedrow.rindex; 2375 } else { 2376 ii = a->i; 2377 y = yarray; 2378 z = zarray; 2379 } 2380 2381 for (i = 0; i < mbs; i++) { 2382 n = ii[1] - ii[0]; 2383 ii++; 2384 if (usecprow) { 2385 z = zarray + 6 * ridx[i]; 2386 y = yarray + 6 * ridx[i]; 2387 } 2388 sum1 = y[0]; 2389 sum2 = y[1]; 2390 sum3 = y[2]; 2391 sum4 = y[3]; 2392 sum5 = y[4]; 2393 sum6 = y[5]; 2394 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2395 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2396 for (j = 0; j < n; j++) { 2397 xb = x + 6 * (*idx++); 2398 x1 = xb[0]; 2399 x2 = xb[1]; 2400 x3 = xb[2]; 2401 x4 = xb[3]; 2402 x5 = xb[4]; 2403 x6 = xb[5]; 2404 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2405 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2406 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2407 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2408 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2409 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2410 v += 36; 2411 } 2412 z[0] = sum1; 2413 z[1] = sum2; 2414 z[2] = sum3; 2415 z[3] = sum4; 2416 z[4] = sum5; 2417 z[5] = sum6; 2418 if (!usecprow) { 2419 z += 6; 2420 y += 6; 2421 } 2422 } 2423 PetscCall(VecRestoreArrayRead(xx, &x)); 2424 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2425 PetscCall(PetscLogFlops(72.0 * a->nz)); 2426 PetscFunctionReturn(0); 2427 } 2428 2429 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { 2430 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2431 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2432 const PetscScalar *x, *xb; 2433 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2434 const MatScalar *v; 2435 PetscInt mbs = a->mbs, i, j, n; 2436 const PetscInt *idx, *ii, *ridx = NULL; 2437 PetscBool usecprow = a->compressedrow.use; 2438 2439 PetscFunctionBegin; 2440 PetscCall(VecGetArrayRead(xx, &x)); 2441 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2442 2443 idx = a->j; 2444 v = a->a; 2445 if (usecprow) { 2446 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2447 mbs = a->compressedrow.nrows; 2448 ii = a->compressedrow.i; 2449 ridx = a->compressedrow.rindex; 2450 } else { 2451 ii = a->i; 2452 y = yarray; 2453 z = zarray; 2454 } 2455 2456 for (i = 0; i < mbs; i++) { 2457 n = ii[1] - ii[0]; 2458 ii++; 2459 if (usecprow) { 2460 z = zarray + 7 * ridx[i]; 2461 y = yarray + 7 * ridx[i]; 2462 } 2463 sum1 = y[0]; 2464 sum2 = y[1]; 2465 sum3 = y[2]; 2466 sum4 = y[3]; 2467 sum5 = y[4]; 2468 sum6 = y[5]; 2469 sum7 = y[6]; 2470 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2471 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2472 for (j = 0; j < n; j++) { 2473 xb = x + 7 * (*idx++); 2474 x1 = xb[0]; 2475 x2 = xb[1]; 2476 x3 = xb[2]; 2477 x4 = xb[3]; 2478 x5 = xb[4]; 2479 x6 = xb[5]; 2480 x7 = xb[6]; 2481 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2482 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2483 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2484 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2485 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2486 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2487 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2488 v += 49; 2489 } 2490 z[0] = sum1; 2491 z[1] = sum2; 2492 z[2] = sum3; 2493 z[3] = sum4; 2494 z[4] = sum5; 2495 z[5] = sum6; 2496 z[6] = sum7; 2497 if (!usecprow) { 2498 z += 7; 2499 y += 7; 2500 } 2501 } 2502 PetscCall(VecRestoreArrayRead(xx, &x)); 2503 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2504 PetscCall(PetscLogFlops(98.0 * a->nz)); 2505 PetscFunctionReturn(0); 2506 } 2507 2508 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2509 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) { 2510 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2511 PetscScalar *z = NULL, *work, *workt, *zarray; 2512 const PetscScalar *x, *xb; 2513 const MatScalar *v; 2514 PetscInt mbs, i, j, n; 2515 PetscInt k; 2516 PetscBool usecprow = a->compressedrow.use; 2517 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2518 2519 __m256d a0, a1, a2, a3, a4, a5; 2520 __m256d w0, w1, w2, w3; 2521 __m256d z0, z1, z2; 2522 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2523 2524 PetscFunctionBegin; 2525 PetscCall(VecCopy(yy, zz)); 2526 PetscCall(VecGetArrayRead(xx, &x)); 2527 PetscCall(VecGetArray(zz, &zarray)); 2528 2529 idx = a->j; 2530 v = a->a; 2531 if (usecprow) { 2532 mbs = a->compressedrow.nrows; 2533 ii = a->compressedrow.i; 2534 ridx = a->compressedrow.rindex; 2535 } else { 2536 mbs = a->mbs; 2537 ii = a->i; 2538 z = zarray; 2539 } 2540 2541 if (!a->mult_work) { 2542 k = PetscMax(A->rmap->n, A->cmap->n); 2543 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2544 } 2545 2546 work = a->mult_work; 2547 for (i = 0; i < mbs; i++) { 2548 n = ii[1] - ii[0]; 2549 ii++; 2550 workt = work; 2551 for (j = 0; j < n; j++) { 2552 xb = x + bs * (*idx++); 2553 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2554 workt += bs; 2555 } 2556 if (usecprow) z = zarray + bs * ridx[i]; 2557 2558 z0 = _mm256_loadu_pd(&z[0]); 2559 z1 = _mm256_loadu_pd(&z[4]); 2560 z2 = _mm256_set1_pd(z[8]); 2561 2562 for (j = 0; j < n; j++) { 2563 /* first column of a */ 2564 w0 = _mm256_set1_pd(work[j * 9]); 2565 a0 = _mm256_loadu_pd(&v[j * 81]); 2566 z0 = _mm256_fmadd_pd(a0, w0, z0); 2567 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2568 z1 = _mm256_fmadd_pd(a1, w0, z1); 2569 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2570 z2 = _mm256_fmadd_pd(a2, w0, z2); 2571 2572 /* second column of a */ 2573 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2574 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2575 z0 = _mm256_fmadd_pd(a0, w1, z0); 2576 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2577 z1 = _mm256_fmadd_pd(a1, w1, z1); 2578 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2579 z2 = _mm256_fmadd_pd(a2, w1, z2); 2580 2581 /* third column of a */ 2582 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2583 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2584 z0 = _mm256_fmadd_pd(a3, w2, z0); 2585 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2586 z1 = _mm256_fmadd_pd(a4, w2, z1); 2587 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2588 z2 = _mm256_fmadd_pd(a5, w2, z2); 2589 2590 /* fourth column of a */ 2591 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2592 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2593 z0 = _mm256_fmadd_pd(a0, w3, z0); 2594 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2595 z1 = _mm256_fmadd_pd(a1, w3, z1); 2596 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2597 z2 = _mm256_fmadd_pd(a2, w3, z2); 2598 2599 /* fifth column of a */ 2600 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2601 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2602 z0 = _mm256_fmadd_pd(a3, w0, z0); 2603 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2604 z1 = _mm256_fmadd_pd(a4, w0, z1); 2605 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2606 z2 = _mm256_fmadd_pd(a5, w0, z2); 2607 2608 /* sixth column of a */ 2609 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2610 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2611 z0 = _mm256_fmadd_pd(a0, w1, z0); 2612 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2613 z1 = _mm256_fmadd_pd(a1, w1, z1); 2614 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2615 z2 = _mm256_fmadd_pd(a2, w1, z2); 2616 2617 /* seventh column of a */ 2618 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2619 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2620 z0 = _mm256_fmadd_pd(a0, w2, z0); 2621 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2622 z1 = _mm256_fmadd_pd(a1, w2, z1); 2623 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2624 z2 = _mm256_fmadd_pd(a2, w2, z2); 2625 2626 /* eighth column of a */ 2627 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2628 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2629 z0 = _mm256_fmadd_pd(a3, w3, z0); 2630 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2631 z1 = _mm256_fmadd_pd(a4, w3, z1); 2632 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2633 z2 = _mm256_fmadd_pd(a5, w3, z2); 2634 2635 /* ninth column of a */ 2636 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2637 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2638 z0 = _mm256_fmadd_pd(a0, w0, z0); 2639 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2640 z1 = _mm256_fmadd_pd(a1, w0, z1); 2641 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2642 z2 = _mm256_fmadd_pd(a2, w0, z2); 2643 } 2644 2645 _mm256_storeu_pd(&z[0], z0); 2646 _mm256_storeu_pd(&z[4], z1); 2647 _mm256_maskstore_pd(&z[8], mask1, z2); 2648 2649 v += n * bs2; 2650 if (!usecprow) z += bs; 2651 } 2652 PetscCall(VecRestoreArrayRead(xx, &x)); 2653 PetscCall(VecRestoreArray(zz, &zarray)); 2654 PetscCall(PetscLogFlops(162.0 * a->nz)); 2655 PetscFunctionReturn(0); 2656 } 2657 #endif 2658 2659 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) { 2660 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2661 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2662 const PetscScalar *x, *xb; 2663 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2664 const MatScalar *v; 2665 PetscInt mbs = a->mbs, i, j, n; 2666 const PetscInt *idx, *ii, *ridx = NULL; 2667 PetscBool usecprow = a->compressedrow.use; 2668 2669 PetscFunctionBegin; 2670 PetscCall(VecGetArrayRead(xx, &x)); 2671 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2672 2673 idx = a->j; 2674 v = a->a; 2675 if (usecprow) { 2676 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2677 mbs = a->compressedrow.nrows; 2678 ii = a->compressedrow.i; 2679 ridx = a->compressedrow.rindex; 2680 } else { 2681 ii = a->i; 2682 y = yarray; 2683 z = zarray; 2684 } 2685 2686 for (i = 0; i < mbs; i++) { 2687 n = ii[1] - ii[0]; 2688 ii++; 2689 if (usecprow) { 2690 z = zarray + 11 * ridx[i]; 2691 y = yarray + 11 * ridx[i]; 2692 } 2693 sum1 = y[0]; 2694 sum2 = y[1]; 2695 sum3 = y[2]; 2696 sum4 = y[3]; 2697 sum5 = y[4]; 2698 sum6 = y[5]; 2699 sum7 = y[6]; 2700 sum8 = y[7]; 2701 sum9 = y[8]; 2702 sum10 = y[9]; 2703 sum11 = y[10]; 2704 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2705 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2706 for (j = 0; j < n; j++) { 2707 xb = x + 11 * (*idx++); 2708 x1 = xb[0]; 2709 x2 = xb[1]; 2710 x3 = xb[2]; 2711 x4 = xb[3]; 2712 x5 = xb[4]; 2713 x6 = xb[5]; 2714 x7 = xb[6]; 2715 x8 = xb[7]; 2716 x9 = xb[8]; 2717 x10 = xb[9]; 2718 x11 = xb[10]; 2719 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2720 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2721 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2722 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2723 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2724 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2725 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2726 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2727 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2728 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2729 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2730 v += 121; 2731 } 2732 z[0] = sum1; 2733 z[1] = sum2; 2734 z[2] = sum3; 2735 z[3] = sum4; 2736 z[4] = sum5; 2737 z[5] = sum6; 2738 z[6] = sum7; 2739 z[7] = sum8; 2740 z[8] = sum9; 2741 z[9] = sum10; 2742 z[10] = sum11; 2743 if (!usecprow) { 2744 z += 11; 2745 y += 11; 2746 } 2747 } 2748 PetscCall(VecRestoreArrayRead(xx, &x)); 2749 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2750 PetscCall(PetscLogFlops(242.0 * a->nz)); 2751 PetscFunctionReturn(0); 2752 } 2753 2754 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { 2755 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2756 PetscScalar *z = NULL, *work, *workt, *zarray; 2757 const PetscScalar *x, *xb; 2758 const MatScalar *v; 2759 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2760 PetscInt ncols, k; 2761 const PetscInt *ridx = NULL, *idx, *ii; 2762 PetscBool usecprow = a->compressedrow.use; 2763 2764 PetscFunctionBegin; 2765 PetscCall(VecCopy(yy, zz)); 2766 PetscCall(VecGetArrayRead(xx, &x)); 2767 PetscCall(VecGetArray(zz, &zarray)); 2768 2769 idx = a->j; 2770 v = a->a; 2771 if (usecprow) { 2772 mbs = a->compressedrow.nrows; 2773 ii = a->compressedrow.i; 2774 ridx = a->compressedrow.rindex; 2775 } else { 2776 mbs = a->mbs; 2777 ii = a->i; 2778 z = zarray; 2779 } 2780 2781 if (!a->mult_work) { 2782 k = PetscMax(A->rmap->n, A->cmap->n); 2783 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2784 } 2785 work = a->mult_work; 2786 for (i = 0; i < mbs; i++) { 2787 n = ii[1] - ii[0]; 2788 ii++; 2789 ncols = n * bs; 2790 workt = work; 2791 for (j = 0; j < n; j++) { 2792 xb = x + bs * (*idx++); 2793 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2794 workt += bs; 2795 } 2796 if (usecprow) z = zarray + bs * ridx[i]; 2797 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2798 v += n * bs2; 2799 if (!usecprow) z += bs; 2800 } 2801 PetscCall(VecRestoreArrayRead(xx, &x)); 2802 PetscCall(VecRestoreArray(zz, &zarray)); 2803 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2804 PetscFunctionReturn(0); 2805 } 2806 2807 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 2808 PetscScalar zero = 0.0; 2809 2810 PetscFunctionBegin; 2811 PetscCall(VecSet(zz, zero)); 2812 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2813 PetscFunctionReturn(0); 2814 } 2815 2816 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 2817 PetscScalar zero = 0.0; 2818 2819 PetscFunctionBegin; 2820 PetscCall(VecSet(zz, zero)); 2821 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2822 PetscFunctionReturn(0); 2823 } 2824 2825 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 2826 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2827 PetscScalar *z, x1, x2, x3, x4, x5; 2828 const PetscScalar *x, *xb = NULL; 2829 const MatScalar *v; 2830 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2831 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2832 Mat_CompressedRow cprow = a->compressedrow; 2833 PetscBool usecprow = cprow.use; 2834 2835 PetscFunctionBegin; 2836 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2837 PetscCall(VecGetArrayRead(xx, &x)); 2838 PetscCall(VecGetArray(zz, &z)); 2839 2840 idx = a->j; 2841 v = a->a; 2842 if (usecprow) { 2843 mbs = cprow.nrows; 2844 ii = cprow.i; 2845 ridx = cprow.rindex; 2846 } else { 2847 mbs = a->mbs; 2848 ii = a->i; 2849 xb = x; 2850 } 2851 2852 switch (bs) { 2853 case 1: 2854 for (i = 0; i < mbs; i++) { 2855 if (usecprow) xb = x + ridx[i]; 2856 x1 = xb[0]; 2857 ib = idx + ii[0]; 2858 n = ii[1] - ii[0]; 2859 ii++; 2860 for (j = 0; j < n; j++) { 2861 rval = ib[j]; 2862 z[rval] += PetscConj(*v) * x1; 2863 v++; 2864 } 2865 if (!usecprow) xb++; 2866 } 2867 break; 2868 case 2: 2869 for (i = 0; i < mbs; i++) { 2870 if (usecprow) xb = x + 2 * ridx[i]; 2871 x1 = xb[0]; 2872 x2 = xb[1]; 2873 ib = idx + ii[0]; 2874 n = ii[1] - ii[0]; 2875 ii++; 2876 for (j = 0; j < n; j++) { 2877 rval = ib[j] * 2; 2878 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2879 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2880 v += 4; 2881 } 2882 if (!usecprow) xb += 2; 2883 } 2884 break; 2885 case 3: 2886 for (i = 0; i < mbs; i++) { 2887 if (usecprow) xb = x + 3 * ridx[i]; 2888 x1 = xb[0]; 2889 x2 = xb[1]; 2890 x3 = xb[2]; 2891 ib = idx + ii[0]; 2892 n = ii[1] - ii[0]; 2893 ii++; 2894 for (j = 0; j < n; j++) { 2895 rval = ib[j] * 3; 2896 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2897 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2898 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2899 v += 9; 2900 } 2901 if (!usecprow) xb += 3; 2902 } 2903 break; 2904 case 4: 2905 for (i = 0; i < mbs; i++) { 2906 if (usecprow) xb = x + 4 * ridx[i]; 2907 x1 = xb[0]; 2908 x2 = xb[1]; 2909 x3 = xb[2]; 2910 x4 = xb[3]; 2911 ib = idx + ii[0]; 2912 n = ii[1] - ii[0]; 2913 ii++; 2914 for (j = 0; j < n; j++) { 2915 rval = ib[j] * 4; 2916 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2917 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2918 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2919 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2920 v += 16; 2921 } 2922 if (!usecprow) xb += 4; 2923 } 2924 break; 2925 case 5: 2926 for (i = 0; i < mbs; i++) { 2927 if (usecprow) xb = x + 5 * ridx[i]; 2928 x1 = xb[0]; 2929 x2 = xb[1]; 2930 x3 = xb[2]; 2931 x4 = xb[3]; 2932 x5 = xb[4]; 2933 ib = idx + ii[0]; 2934 n = ii[1] - ii[0]; 2935 ii++; 2936 for (j = 0; j < n; j++) { 2937 rval = ib[j] * 5; 2938 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2939 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2940 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2941 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2942 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2943 v += 25; 2944 } 2945 if (!usecprow) xb += 5; 2946 } 2947 break; 2948 default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2949 #if 0 2950 { 2951 PetscInt ncols,k,bs2=a->bs2; 2952 PetscScalar *work,*workt,zb; 2953 const PetscScalar *xtmp; 2954 if (!a->mult_work) { 2955 k = PetscMax(A->rmap->n,A->cmap->n); 2956 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2957 } 2958 work = a->mult_work; 2959 xtmp = x; 2960 for (i=0; i<mbs; i++) { 2961 n = ii[1] - ii[0]; ii++; 2962 ncols = n*bs; 2963 PetscCall(PetscArrayzero(work,ncols)); 2964 if (usecprow) xtmp = x + bs*ridx[i]; 2965 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2966 v += n*bs2; 2967 if (!usecprow) xtmp += bs; 2968 workt = work; 2969 for (j=0; j<n; j++) { 2970 zb = z + bs*(*idx++); 2971 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2972 workt += bs; 2973 } 2974 } 2975 } 2976 #endif 2977 } 2978 PetscCall(VecRestoreArrayRead(xx, &x)); 2979 PetscCall(VecRestoreArray(zz, &z)); 2980 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 2981 PetscFunctionReturn(0); 2982 } 2983 2984 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 2985 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2986 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2987 const PetscScalar *x, *xb = NULL; 2988 const MatScalar *v; 2989 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2990 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2991 Mat_CompressedRow cprow = a->compressedrow; 2992 PetscBool usecprow = cprow.use; 2993 2994 PetscFunctionBegin; 2995 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2996 PetscCall(VecGetArrayRead(xx, &x)); 2997 PetscCall(VecGetArray(zz, &z)); 2998 2999 idx = a->j; 3000 v = a->a; 3001 if (usecprow) { 3002 mbs = cprow.nrows; 3003 ii = cprow.i; 3004 ridx = cprow.rindex; 3005 } else { 3006 mbs = a->mbs; 3007 ii = a->i; 3008 xb = x; 3009 } 3010 3011 switch (bs) { 3012 case 1: 3013 for (i = 0; i < mbs; i++) { 3014 if (usecprow) xb = x + ridx[i]; 3015 x1 = xb[0]; 3016 ib = idx + ii[0]; 3017 n = ii[1] - ii[0]; 3018 ii++; 3019 for (j = 0; j < n; j++) { 3020 rval = ib[j]; 3021 z[rval] += *v * x1; 3022 v++; 3023 } 3024 if (!usecprow) xb++; 3025 } 3026 break; 3027 case 2: 3028 for (i = 0; i < mbs; i++) { 3029 if (usecprow) xb = x + 2 * ridx[i]; 3030 x1 = xb[0]; 3031 x2 = xb[1]; 3032 ib = idx + ii[0]; 3033 n = ii[1] - ii[0]; 3034 ii++; 3035 for (j = 0; j < n; j++) { 3036 rval = ib[j] * 2; 3037 z[rval++] += v[0] * x1 + v[1] * x2; 3038 z[rval++] += v[2] * x1 + v[3] * x2; 3039 v += 4; 3040 } 3041 if (!usecprow) xb += 2; 3042 } 3043 break; 3044 case 3: 3045 for (i = 0; i < mbs; i++) { 3046 if (usecprow) xb = x + 3 * ridx[i]; 3047 x1 = xb[0]; 3048 x2 = xb[1]; 3049 x3 = xb[2]; 3050 ib = idx + ii[0]; 3051 n = ii[1] - ii[0]; 3052 ii++; 3053 for (j = 0; j < n; j++) { 3054 rval = ib[j] * 3; 3055 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3056 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3057 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3058 v += 9; 3059 } 3060 if (!usecprow) xb += 3; 3061 } 3062 break; 3063 case 4: 3064 for (i = 0; i < mbs; i++) { 3065 if (usecprow) xb = x + 4 * ridx[i]; 3066 x1 = xb[0]; 3067 x2 = xb[1]; 3068 x3 = xb[2]; 3069 x4 = xb[3]; 3070 ib = idx + ii[0]; 3071 n = ii[1] - ii[0]; 3072 ii++; 3073 for (j = 0; j < n; j++) { 3074 rval = ib[j] * 4; 3075 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3076 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3077 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3078 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3079 v += 16; 3080 } 3081 if (!usecprow) xb += 4; 3082 } 3083 break; 3084 case 5: 3085 for (i = 0; i < mbs; i++) { 3086 if (usecprow) xb = x + 5 * ridx[i]; 3087 x1 = xb[0]; 3088 x2 = xb[1]; 3089 x3 = xb[2]; 3090 x4 = xb[3]; 3091 x5 = xb[4]; 3092 ib = idx + ii[0]; 3093 n = ii[1] - ii[0]; 3094 ii++; 3095 for (j = 0; j < n; j++) { 3096 rval = ib[j] * 5; 3097 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3098 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3099 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3100 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3101 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3102 v += 25; 3103 } 3104 if (!usecprow) xb += 5; 3105 } 3106 break; 3107 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3108 PetscInt ncols, k; 3109 PetscScalar *work, *workt; 3110 const PetscScalar *xtmp; 3111 if (!a->mult_work) { 3112 k = PetscMax(A->rmap->n, A->cmap->n); 3113 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3114 } 3115 work = a->mult_work; 3116 xtmp = x; 3117 for (i = 0; i < mbs; i++) { 3118 n = ii[1] - ii[0]; 3119 ii++; 3120 ncols = n * bs; 3121 PetscCall(PetscArrayzero(work, ncols)); 3122 if (usecprow) xtmp = x + bs * ridx[i]; 3123 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3124 v += n * bs2; 3125 if (!usecprow) xtmp += bs; 3126 workt = work; 3127 for (j = 0; j < n; j++) { 3128 zb = z + bs * (*idx++); 3129 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3130 workt += bs; 3131 } 3132 } 3133 } 3134 } 3135 PetscCall(VecRestoreArrayRead(xx, &x)); 3136 PetscCall(VecRestoreArray(zz, &z)); 3137 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3138 PetscFunctionReturn(0); 3139 } 3140 3141 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) { 3142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3143 PetscInt totalnz = a->bs2 * a->nz; 3144 PetscScalar oalpha = alpha; 3145 PetscBLASInt one = 1, tnz; 3146 3147 PetscFunctionBegin; 3148 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3149 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3150 PetscCall(PetscLogFlops(totalnz)); 3151 PetscFunctionReturn(0); 3152 } 3153 3154 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) { 3155 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3156 MatScalar *v = a->a; 3157 PetscReal sum = 0.0; 3158 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3159 3160 PetscFunctionBegin; 3161 if (type == NORM_FROBENIUS) { 3162 #if defined(PETSC_USE_REAL___FP16) 3163 PetscBLASInt one = 1, cnt = bs2 * nz; 3164 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3165 #else 3166 for (i = 0; i < bs2 * nz; i++) { 3167 sum += PetscRealPart(PetscConj(*v) * (*v)); 3168 v++; 3169 } 3170 #endif 3171 *norm = PetscSqrtReal(sum); 3172 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3173 } else if (type == NORM_1) { /* maximum column sum */ 3174 PetscReal *tmp; 3175 PetscInt *bcol = a->j; 3176 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3177 for (i = 0; i < nz; i++) { 3178 for (j = 0; j < bs; j++) { 3179 k1 = bs * (*bcol) + j; /* column index */ 3180 for (k = 0; k < bs; k++) { 3181 tmp[k1] += PetscAbsScalar(*v); 3182 v++; 3183 } 3184 } 3185 bcol++; 3186 } 3187 *norm = 0.0; 3188 for (j = 0; j < A->cmap->n; j++) { 3189 if (tmp[j] > *norm) *norm = tmp[j]; 3190 } 3191 PetscCall(PetscFree(tmp)); 3192 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3193 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3194 *norm = 0.0; 3195 for (k = 0; k < bs; k++) { 3196 for (j = 0; j < a->mbs; j++) { 3197 v = a->a + bs2 * a->i[j] + k; 3198 sum = 0.0; 3199 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3200 for (k1 = 0; k1 < bs; k1++) { 3201 sum += PetscAbsScalar(*v); 3202 v += bs; 3203 } 3204 } 3205 if (sum > *norm) *norm = sum; 3206 } 3207 } 3208 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3209 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3210 PetscFunctionReturn(0); 3211 } 3212 3213 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) { 3214 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3215 3216 PetscFunctionBegin; 3217 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3218 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3219 *flg = PETSC_FALSE; 3220 PetscFunctionReturn(0); 3221 } 3222 3223 /* if the a->i are the same */ 3224 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3225 if (!*flg) PetscFunctionReturn(0); 3226 3227 /* if a->j are the same */ 3228 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3229 if (!*flg) PetscFunctionReturn(0); 3230 3231 /* if a->a are the same */ 3232 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3233 PetscFunctionReturn(0); 3234 } 3235 3236 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) { 3237 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3238 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3239 PetscScalar *x, zero = 0.0; 3240 MatScalar *aa, *aa_j; 3241 3242 PetscFunctionBegin; 3243 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3244 bs = A->rmap->bs; 3245 aa = a->a; 3246 ai = a->i; 3247 aj = a->j; 3248 ambs = a->mbs; 3249 bs2 = a->bs2; 3250 3251 PetscCall(VecSet(v, zero)); 3252 PetscCall(VecGetArray(v, &x)); 3253 PetscCall(VecGetLocalSize(v, &n)); 3254 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3255 for (i = 0; i < ambs; i++) { 3256 for (j = ai[i]; j < ai[i + 1]; j++) { 3257 if (aj[j] == i) { 3258 row = i * bs; 3259 aa_j = aa + j * bs2; 3260 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3261 break; 3262 } 3263 } 3264 } 3265 PetscCall(VecRestoreArray(v, &x)); 3266 PetscFunctionReturn(0); 3267 } 3268 3269 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) { 3270 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3271 const PetscScalar *l, *r, *li, *ri; 3272 PetscScalar x; 3273 MatScalar *aa, *v; 3274 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3275 const PetscInt *ai, *aj; 3276 3277 PetscFunctionBegin; 3278 ai = a->i; 3279 aj = a->j; 3280 aa = a->a; 3281 m = A->rmap->n; 3282 n = A->cmap->n; 3283 bs = A->rmap->bs; 3284 mbs = a->mbs; 3285 bs2 = a->bs2; 3286 if (ll) { 3287 PetscCall(VecGetArrayRead(ll, &l)); 3288 PetscCall(VecGetLocalSize(ll, &lm)); 3289 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3290 for (i = 0; i < mbs; i++) { /* for each block row */ 3291 M = ai[i + 1] - ai[i]; 3292 li = l + i * bs; 3293 v = aa + bs2 * ai[i]; 3294 for (j = 0; j < M; j++) { /* for each block */ 3295 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 3296 } 3297 } 3298 PetscCall(VecRestoreArrayRead(ll, &l)); 3299 PetscCall(PetscLogFlops(a->nz)); 3300 } 3301 3302 if (rr) { 3303 PetscCall(VecGetArrayRead(rr, &r)); 3304 PetscCall(VecGetLocalSize(rr, &rn)); 3305 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3306 for (i = 0; i < mbs; i++) { /* for each block row */ 3307 iai = ai[i]; 3308 M = ai[i + 1] - iai; 3309 v = aa + bs2 * iai; 3310 for (j = 0; j < M; j++) { /* for each block */ 3311 ri = r + bs * aj[iai + j]; 3312 for (k = 0; k < bs; k++) { 3313 x = ri[k]; 3314 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3315 v += bs; 3316 } 3317 } 3318 } 3319 PetscCall(VecRestoreArrayRead(rr, &r)); 3320 PetscCall(PetscLogFlops(a->nz)); 3321 } 3322 PetscFunctionReturn(0); 3323 } 3324 3325 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) { 3326 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3327 3328 PetscFunctionBegin; 3329 info->block_size = a->bs2; 3330 info->nz_allocated = a->bs2 * a->maxnz; 3331 info->nz_used = a->bs2 * a->nz; 3332 info->nz_unneeded = info->nz_allocated - info->nz_used; 3333 info->assemblies = A->num_ass; 3334 info->mallocs = A->info.mallocs; 3335 info->memory = 0; /* REVIEW ME */ 3336 if (A->factortype) { 3337 info->fill_ratio_given = A->info.fill_ratio_given; 3338 info->fill_ratio_needed = A->info.fill_ratio_needed; 3339 info->factor_mallocs = A->info.factor_mallocs; 3340 } else { 3341 info->fill_ratio_given = 0; 3342 info->fill_ratio_needed = 0; 3343 info->factor_mallocs = 0; 3344 } 3345 PetscFunctionReturn(0); 3346 } 3347 3348 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) { 3349 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3350 3351 PetscFunctionBegin; 3352 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3353 PetscFunctionReturn(0); 3354 } 3355 3356 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 3357 PetscFunctionBegin; 3358 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3359 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3360 PetscFunctionReturn(0); 3361 } 3362 3363 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3364 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3365 PetscScalar *z = NULL, sum1; 3366 const PetscScalar *xb; 3367 PetscScalar x1; 3368 const MatScalar *v, *vv; 3369 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3370 PetscBool usecprow = a->compressedrow.use; 3371 3372 PetscFunctionBegin; 3373 idx = a->j; 3374 v = a->a; 3375 if (usecprow) { 3376 mbs = a->compressedrow.nrows; 3377 ii = a->compressedrow.i; 3378 ridx = a->compressedrow.rindex; 3379 } else { 3380 mbs = a->mbs; 3381 ii = a->i; 3382 z = c; 3383 } 3384 3385 for (i = 0; i < mbs; i++) { 3386 n = ii[1] - ii[0]; 3387 ii++; 3388 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3389 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3390 if (usecprow) z = c + ridx[i]; 3391 jj = idx; 3392 vv = v; 3393 for (k = 0; k < cn; k++) { 3394 idx = jj; 3395 v = vv; 3396 sum1 = 0.0; 3397 for (j = 0; j < n; j++) { 3398 xb = b + (*idx++); 3399 x1 = xb[0 + k * bm]; 3400 sum1 += v[0] * x1; 3401 v += 1; 3402 } 3403 z[0 + k * cm] = sum1; 3404 } 3405 if (!usecprow) z += 1; 3406 } 3407 PetscFunctionReturn(0); 3408 } 3409 3410 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3411 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3412 PetscScalar *z = NULL, sum1, sum2; 3413 const PetscScalar *xb; 3414 PetscScalar x1, x2; 3415 const MatScalar *v, *vv; 3416 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3417 PetscBool usecprow = a->compressedrow.use; 3418 3419 PetscFunctionBegin; 3420 idx = a->j; 3421 v = a->a; 3422 if (usecprow) { 3423 mbs = a->compressedrow.nrows; 3424 ii = a->compressedrow.i; 3425 ridx = a->compressedrow.rindex; 3426 } else { 3427 mbs = a->mbs; 3428 ii = a->i; 3429 z = c; 3430 } 3431 3432 for (i = 0; i < mbs; i++) { 3433 n = ii[1] - ii[0]; 3434 ii++; 3435 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3436 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3437 if (usecprow) z = c + 2 * ridx[i]; 3438 jj = idx; 3439 vv = v; 3440 for (k = 0; k < cn; k++) { 3441 idx = jj; 3442 v = vv; 3443 sum1 = 0.0; 3444 sum2 = 0.0; 3445 for (j = 0; j < n; j++) { 3446 xb = b + 2 * (*idx++); 3447 x1 = xb[0 + k * bm]; 3448 x2 = xb[1 + k * bm]; 3449 sum1 += v[0] * x1 + v[2] * x2; 3450 sum2 += v[1] * x1 + v[3] * x2; 3451 v += 4; 3452 } 3453 z[0 + k * cm] = sum1; 3454 z[1 + k * cm] = sum2; 3455 } 3456 if (!usecprow) z += 2; 3457 } 3458 PetscFunctionReturn(0); 3459 } 3460 3461 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3462 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3463 PetscScalar *z = NULL, sum1, sum2, sum3; 3464 const PetscScalar *xb; 3465 PetscScalar x1, x2, x3; 3466 const MatScalar *v, *vv; 3467 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3468 PetscBool usecprow = a->compressedrow.use; 3469 3470 PetscFunctionBegin; 3471 idx = a->j; 3472 v = a->a; 3473 if (usecprow) { 3474 mbs = a->compressedrow.nrows; 3475 ii = a->compressedrow.i; 3476 ridx = a->compressedrow.rindex; 3477 } else { 3478 mbs = a->mbs; 3479 ii = a->i; 3480 z = c; 3481 } 3482 3483 for (i = 0; i < mbs; i++) { 3484 n = ii[1] - ii[0]; 3485 ii++; 3486 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3487 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3488 if (usecprow) z = c + 3 * ridx[i]; 3489 jj = idx; 3490 vv = v; 3491 for (k = 0; k < cn; k++) { 3492 idx = jj; 3493 v = vv; 3494 sum1 = 0.0; 3495 sum2 = 0.0; 3496 sum3 = 0.0; 3497 for (j = 0; j < n; j++) { 3498 xb = b + 3 * (*idx++); 3499 x1 = xb[0 + k * bm]; 3500 x2 = xb[1 + k * bm]; 3501 x3 = xb[2 + k * bm]; 3502 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3503 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3504 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3505 v += 9; 3506 } 3507 z[0 + k * cm] = sum1; 3508 z[1 + k * cm] = sum2; 3509 z[2 + k * cm] = sum3; 3510 } 3511 if (!usecprow) z += 3; 3512 } 3513 PetscFunctionReturn(0); 3514 } 3515 3516 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3517 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3518 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3519 const PetscScalar *xb; 3520 PetscScalar x1, x2, x3, x4; 3521 const MatScalar *v, *vv; 3522 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3523 PetscBool usecprow = a->compressedrow.use; 3524 3525 PetscFunctionBegin; 3526 idx = a->j; 3527 v = a->a; 3528 if (usecprow) { 3529 mbs = a->compressedrow.nrows; 3530 ii = a->compressedrow.i; 3531 ridx = a->compressedrow.rindex; 3532 } else { 3533 mbs = a->mbs; 3534 ii = a->i; 3535 z = c; 3536 } 3537 3538 for (i = 0; i < mbs; i++) { 3539 n = ii[1] - ii[0]; 3540 ii++; 3541 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3542 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3543 if (usecprow) z = c + 4 * ridx[i]; 3544 jj = idx; 3545 vv = v; 3546 for (k = 0; k < cn; k++) { 3547 idx = jj; 3548 v = vv; 3549 sum1 = 0.0; 3550 sum2 = 0.0; 3551 sum3 = 0.0; 3552 sum4 = 0.0; 3553 for (j = 0; j < n; j++) { 3554 xb = b + 4 * (*idx++); 3555 x1 = xb[0 + k * bm]; 3556 x2 = xb[1 + k * bm]; 3557 x3 = xb[2 + k * bm]; 3558 x4 = xb[3 + k * bm]; 3559 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3560 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3561 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3562 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3563 v += 16; 3564 } 3565 z[0 + k * cm] = sum1; 3566 z[1 + k * cm] = sum2; 3567 z[2 + k * cm] = sum3; 3568 z[3 + k * cm] = sum4; 3569 } 3570 if (!usecprow) z += 4; 3571 } 3572 PetscFunctionReturn(0); 3573 } 3574 3575 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 3576 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3577 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3578 const PetscScalar *xb; 3579 PetscScalar x1, x2, x3, x4, x5; 3580 const MatScalar *v, *vv; 3581 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3582 PetscBool usecprow = a->compressedrow.use; 3583 3584 PetscFunctionBegin; 3585 idx = a->j; 3586 v = a->a; 3587 if (usecprow) { 3588 mbs = a->compressedrow.nrows; 3589 ii = a->compressedrow.i; 3590 ridx = a->compressedrow.rindex; 3591 } else { 3592 mbs = a->mbs; 3593 ii = a->i; 3594 z = c; 3595 } 3596 3597 for (i = 0; i < mbs; i++) { 3598 n = ii[1] - ii[0]; 3599 ii++; 3600 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3601 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3602 if (usecprow) z = c + 5 * ridx[i]; 3603 jj = idx; 3604 vv = v; 3605 for (k = 0; k < cn; k++) { 3606 idx = jj; 3607 v = vv; 3608 sum1 = 0.0; 3609 sum2 = 0.0; 3610 sum3 = 0.0; 3611 sum4 = 0.0; 3612 sum5 = 0.0; 3613 for (j = 0; j < n; j++) { 3614 xb = b + 5 * (*idx++); 3615 x1 = xb[0 + k * bm]; 3616 x2 = xb[1 + k * bm]; 3617 x3 = xb[2 + k * bm]; 3618 x4 = xb[3 + k * bm]; 3619 x5 = xb[4 + k * bm]; 3620 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3621 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3622 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3623 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3624 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3625 v += 25; 3626 } 3627 z[0 + k * cm] = sum1; 3628 z[1 + k * cm] = sum2; 3629 z[2 + k * cm] = sum3; 3630 z[3 + k * cm] = sum4; 3631 z[4 + k * cm] = sum5; 3632 } 3633 if (!usecprow) z += 5; 3634 } 3635 PetscFunctionReturn(0); 3636 } 3637 3638 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) { 3639 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3640 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3641 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3642 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3643 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3644 PetscBLASInt bbs, bcn, bbm, bcm; 3645 PetscScalar *z = NULL; 3646 PetscScalar *c, *b; 3647 const MatScalar *v; 3648 const PetscInt *idx, *ii, *ridx = NULL; 3649 PetscScalar _DZero = 0.0, _DOne = 1.0; 3650 PetscBool usecprow = a->compressedrow.use; 3651 3652 PetscFunctionBegin; 3653 if (!cm || !cn) PetscFunctionReturn(0); 3654 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3655 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3656 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3657 b = bd->v; 3658 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 3659 PetscCall(MatDenseGetArray(C, &c)); 3660 switch (bs) { 3661 case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break; 3662 case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break; 3663 case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break; 3664 case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break; 3665 case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break; 3666 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3667 PetscCall(PetscBLASIntCast(bs, &bbs)); 3668 PetscCall(PetscBLASIntCast(cn, &bcn)); 3669 PetscCall(PetscBLASIntCast(bm, &bbm)); 3670 PetscCall(PetscBLASIntCast(cm, &bcm)); 3671 idx = a->j; 3672 v = a->a; 3673 if (usecprow) { 3674 mbs = a->compressedrow.nrows; 3675 ii = a->compressedrow.i; 3676 ridx = a->compressedrow.rindex; 3677 } else { 3678 mbs = a->mbs; 3679 ii = a->i; 3680 z = c; 3681 } 3682 for (i = 0; i < mbs; i++) { 3683 n = ii[1] - ii[0]; 3684 ii++; 3685 if (usecprow) z = c + bs * ridx[i]; 3686 if (n) { 3687 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3688 v += bs2; 3689 } 3690 for (j = 1; j < n; j++) { 3691 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3692 v += bs2; 3693 } 3694 if (!usecprow) z += bs; 3695 } 3696 } 3697 PetscCall(MatDenseRestoreArray(C, &c)); 3698 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3699 PetscFunctionReturn(0); 3700 } 3701