1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 15 const PetscInt *idx; 16 PetscInt start, end, *ai, *aj, bs, *nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m, &table)); 28 PetscCall(PetscMalloc1(m + 1, &nidx)); 29 PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 30 31 for (i = 0; i < is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 PetscCall(PetscBTMemzero(m, table)); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 PetscCall(ISGetIndices(is[i], &idx)); 38 PetscCall(ISGetLocalSize(is[i], &n)); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j = 0; j < n; ++j) { 42 ival = idx[j] / bs; /* convert the indices into block indices */ 43 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 44 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 45 } 46 PetscCall(ISRestoreIndices(is[i], &idx)); 47 PetscCall(ISDestroy(&is[i])); 48 49 k = 0; 50 for (j = 0; j < ov; j++) { /* for each overlap*/ 51 n = isz; 52 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row + 1]; 56 for (l = start; l < end; l++) { 57 val = aj[l]; 58 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j = 0; j < isz; j++) { 64 for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 65 } 66 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 67 } 68 PetscCall(PetscBTDestroy(&table)); 69 PetscCall(PetscFree(nidx)); 70 PetscCall(PetscFree(nidx2)); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 75 { 76 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 77 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 78 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 79 const PetscInt *irow, *icol; 80 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 81 PetscInt *aj = a->j, *ai = a->i; 82 MatScalar *mat_a; 83 Mat C; 84 PetscBool flag; 85 86 PetscFunctionBegin; 87 PetscCall(ISGetIndices(isrow, &irow)); 88 PetscCall(ISGetIndices(iscol, &icol)); 89 PetscCall(ISGetLocalSize(isrow, &nrows)); 90 PetscCall(ISGetLocalSize(iscol, &ncols)); 91 92 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 93 ssmap = smap; 94 PetscCall(PetscMalloc1(1 + nrows, &lens)); 95 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 96 /* determine lens of each row */ 97 for (i = 0; i < nrows; i++) { 98 kstart = ai[irow[i]]; 99 kend = kstart + a->ilen[irow[i]]; 100 lens[i] = 0; 101 for (k = kstart; k < kend; k++) { 102 if (ssmap[aj[k]]) lens[i]++; 103 } 104 } 105 /* Create and fill new matrix */ 106 if (scall == MAT_REUSE_MATRIX) { 107 c = (Mat_SeqBAIJ *)((*B)->data); 108 109 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 110 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 111 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 112 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 113 C = *B; 114 } else { 115 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 116 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 117 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 118 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 119 } 120 c = (Mat_SeqBAIJ *)(C->data); 121 for (i = 0; i < nrows; i++) { 122 row = irow[i]; 123 kstart = ai[row]; 124 kend = kstart + a->ilen[row]; 125 mat_i = c->i[i]; 126 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 127 mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 128 mat_ilen = c->ilen + i; 129 for (k = kstart; k < kend; k++) { 130 if ((tcol = ssmap[a->j[k]])) { 131 *mat_j++ = tcol - 1; 132 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 133 mat_a += bs2; 134 (*mat_ilen)++; 135 } 136 } 137 } 138 /* sort */ 139 if (c->j && c->a) { 140 MatScalar *work; 141 PetscCall(PetscMalloc1(bs2, &work)); 142 for (i = 0; i < nrows; i++) { 143 PetscInt ilen; 144 mat_i = c->i[i]; 145 mat_j = c->j + mat_i; 146 mat_a = c->a + mat_i * bs2; 147 ilen = c->ilen[i]; 148 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 149 } 150 PetscCall(PetscFree(work)); 151 } 152 153 /* Free work space */ 154 PetscCall(ISRestoreIndices(iscol, &icol)); 155 PetscCall(PetscFree(smap)); 156 PetscCall(PetscFree(lens)); 157 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 158 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 159 160 PetscCall(ISRestoreIndices(isrow, &irow)); 161 *B = C; 162 PetscFunctionReturn(0); 163 } 164 165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 166 { 167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 168 IS is1, is2; 169 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 170 const PetscInt *irow, *icol; 171 172 PetscFunctionBegin; 173 PetscCall(ISGetIndices(isrow, &irow)); 174 PetscCall(ISGetIndices(iscol, &icol)); 175 PetscCall(ISGetLocalSize(isrow, &nrows)); 176 PetscCall(ISGetLocalSize(iscol, &ncols)); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 maxmnbs = PetscMax(a->mbs, a->nbs); 181 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 182 PetscCall(PetscArrayzero(vary, a->mbs)); 183 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 184 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 185 count = 0; 186 for (i = 0; i < nrows; i++) { 187 j = irow[i] / bs; 188 if ((vary[j]--) == bs) iary[count++] = j; 189 } 190 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 191 192 PetscCall(PetscArrayzero(vary, a->nbs)); 193 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 194 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 195 count = 0; 196 for (i = 0; i < ncols; i++) { 197 j = icol[i] / bs; 198 if ((vary[j]--) == bs) iary[count++] = j; 199 } 200 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 201 PetscCall(ISRestoreIndices(isrow, &irow)); 202 PetscCall(ISRestoreIndices(iscol, &icol)); 203 PetscCall(PetscFree2(vary, iary)); 204 205 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 206 PetscCall(ISDestroy(&is1)); 207 PetscCall(ISDestroy(&is2)); 208 PetscFunctionReturn(0); 209 } 210 211 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 212 { 213 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 214 Mat_SubSppt *submatj = c->submatis1; 215 216 PetscFunctionBegin; 217 PetscCall((*submatj->destroy)(C)); 218 PetscCall(MatDestroySubMatrix_Private(submatj)); 219 PetscFunctionReturn(0); 220 } 221 222 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 223 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 224 { 225 PetscInt i; 226 Mat C; 227 Mat_SeqBAIJ *c; 228 Mat_SubSppt *submatj; 229 230 PetscFunctionBegin; 231 for (i = 0; i < n; i++) { 232 C = (*mat)[i]; 233 c = (Mat_SeqBAIJ *)C->data; 234 submatj = c->submatis1; 235 if (submatj) { 236 if (--((PetscObject)C)->refct <= 0) { 237 PetscCall(PetscFree(C->factorprefix)); 238 PetscCall((*submatj->destroy)(C)); 239 PetscCall(MatDestroySubMatrix_Private(submatj)); 240 PetscCall(PetscFree(C->defaultvectype)); 241 PetscCall(PetscFree(C->defaultrandtype)); 242 PetscCall(PetscLayoutDestroy(&C->rmap)); 243 PetscCall(PetscLayoutDestroy(&C->cmap)); 244 PetscCall(PetscHeaderDestroy(&C)); 245 } 246 } else { 247 PetscCall(MatDestroy(&C)); 248 } 249 } 250 251 /* Destroy Dummy submatrices created for reuse */ 252 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 253 254 PetscCall(PetscFree(*mat)); 255 PetscFunctionReturn(0); 256 } 257 258 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 259 { 260 PetscInt i; 261 262 PetscFunctionBegin; 263 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 264 265 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 266 PetscFunctionReturn(0); 267 } 268 269 /* -------------------------------------------------------*/ 270 /* Should check that shapes of vectors and matrices match */ 271 /* -------------------------------------------------------*/ 272 273 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 274 { 275 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 276 PetscScalar *z, sum; 277 const PetscScalar *x; 278 const MatScalar *v; 279 PetscInt mbs, i, n; 280 const PetscInt *idx, *ii, *ridx = NULL; 281 PetscBool usecprow = a->compressedrow.use; 282 283 PetscFunctionBegin; 284 PetscCall(VecGetArrayRead(xx, &x)); 285 PetscCall(VecGetArrayWrite(zz, &z)); 286 287 if (usecprow) { 288 mbs = a->compressedrow.nrows; 289 ii = a->compressedrow.i; 290 ridx = a->compressedrow.rindex; 291 PetscCall(PetscArrayzero(z, a->mbs)); 292 } else { 293 mbs = a->mbs; 294 ii = a->i; 295 } 296 297 for (i = 0; i < mbs; i++) { 298 n = ii[1] - ii[0]; 299 v = a->a + ii[0]; 300 idx = a->j + ii[0]; 301 ii++; 302 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 303 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 304 sum = 0.0; 305 PetscSparseDensePlusDot(sum, x, v, idx, n); 306 if (usecprow) { 307 z[ridx[i]] = sum; 308 } else { 309 z[i] = sum; 310 } 311 } 312 PetscCall(VecRestoreArrayRead(xx, &x)); 313 PetscCall(VecRestoreArrayWrite(zz, &z)); 314 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 315 PetscFunctionReturn(0); 316 } 317 318 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 319 { 320 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 321 PetscScalar *z = NULL, sum1, sum2, *zarray; 322 const PetscScalar *x, *xb; 323 PetscScalar x1, x2; 324 const MatScalar *v; 325 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 326 PetscBool usecprow = a->compressedrow.use; 327 328 PetscFunctionBegin; 329 PetscCall(VecGetArrayRead(xx, &x)); 330 PetscCall(VecGetArrayWrite(zz, &zarray)); 331 332 idx = a->j; 333 v = a->a; 334 if (usecprow) { 335 mbs = a->compressedrow.nrows; 336 ii = a->compressedrow.i; 337 ridx = a->compressedrow.rindex; 338 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 339 } else { 340 mbs = a->mbs; 341 ii = a->i; 342 z = zarray; 343 } 344 345 for (i = 0; i < mbs; i++) { 346 n = ii[1] - ii[0]; 347 ii++; 348 sum1 = 0.0; 349 sum2 = 0.0; 350 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 351 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 352 for (j = 0; j < n; j++) { 353 xb = x + 2 * (*idx++); 354 x1 = xb[0]; 355 x2 = xb[1]; 356 sum1 += v[0] * x1 + v[2] * x2; 357 sum2 += v[1] * x1 + v[3] * x2; 358 v += 4; 359 } 360 if (usecprow) z = zarray + 2 * ridx[i]; 361 z[0] = sum1; 362 z[1] = sum2; 363 if (!usecprow) z += 2; 364 } 365 PetscCall(VecRestoreArrayRead(xx, &x)); 366 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 367 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 368 PetscFunctionReturn(0); 369 } 370 371 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 372 { 373 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 374 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 375 const PetscScalar *x, *xb; 376 const MatScalar *v; 377 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 378 PetscBool usecprow = a->compressedrow.use; 379 380 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 381 #pragma disjoint(*v, *z, *xb) 382 #endif 383 384 PetscFunctionBegin; 385 PetscCall(VecGetArrayRead(xx, &x)); 386 PetscCall(VecGetArrayWrite(zz, &zarray)); 387 388 idx = a->j; 389 v = a->a; 390 if (usecprow) { 391 mbs = a->compressedrow.nrows; 392 ii = a->compressedrow.i; 393 ridx = a->compressedrow.rindex; 394 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 395 } else { 396 mbs = a->mbs; 397 ii = a->i; 398 z = zarray; 399 } 400 401 for (i = 0; i < mbs; i++) { 402 n = ii[1] - ii[0]; 403 ii++; 404 sum1 = 0.0; 405 sum2 = 0.0; 406 sum3 = 0.0; 407 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 408 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 409 for (j = 0; j < n; j++) { 410 xb = x + 3 * (*idx++); 411 x1 = xb[0]; 412 x2 = xb[1]; 413 x3 = xb[2]; 414 415 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 416 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 417 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 418 v += 9; 419 } 420 if (usecprow) z = zarray + 3 * ridx[i]; 421 z[0] = sum1; 422 z[1] = sum2; 423 z[2] = sum3; 424 if (!usecprow) z += 3; 425 } 426 PetscCall(VecRestoreArrayRead(xx, &x)); 427 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 428 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 429 PetscFunctionReturn(0); 430 } 431 432 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 433 { 434 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 435 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 436 const PetscScalar *x, *xb; 437 const MatScalar *v; 438 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 439 PetscBool usecprow = a->compressedrow.use; 440 441 PetscFunctionBegin; 442 PetscCall(VecGetArrayRead(xx, &x)); 443 PetscCall(VecGetArrayWrite(zz, &zarray)); 444 445 idx = a->j; 446 v = a->a; 447 if (usecprow) { 448 mbs = a->compressedrow.nrows; 449 ii = a->compressedrow.i; 450 ridx = a->compressedrow.rindex; 451 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 452 } else { 453 mbs = a->mbs; 454 ii = a->i; 455 z = zarray; 456 } 457 458 for (i = 0; i < mbs; i++) { 459 n = ii[1] - ii[0]; 460 ii++; 461 sum1 = 0.0; 462 sum2 = 0.0; 463 sum3 = 0.0; 464 sum4 = 0.0; 465 466 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 467 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 468 for (j = 0; j < n; j++) { 469 xb = x + 4 * (*idx++); 470 x1 = xb[0]; 471 x2 = xb[1]; 472 x3 = xb[2]; 473 x4 = xb[3]; 474 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 475 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 476 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 477 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 478 v += 16; 479 } 480 if (usecprow) z = zarray + 4 * ridx[i]; 481 z[0] = sum1; 482 z[1] = sum2; 483 z[2] = sum3; 484 z[3] = sum4; 485 if (!usecprow) z += 4; 486 } 487 PetscCall(VecRestoreArrayRead(xx, &x)); 488 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 489 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 490 PetscFunctionReturn(0); 491 } 492 493 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 494 { 495 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 496 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 497 const PetscScalar *xb, *x; 498 const MatScalar *v; 499 const PetscInt *idx, *ii, *ridx = NULL; 500 PetscInt mbs, i, j, n; 501 PetscBool usecprow = a->compressedrow.use; 502 503 PetscFunctionBegin; 504 PetscCall(VecGetArrayRead(xx, &x)); 505 PetscCall(VecGetArrayWrite(zz, &zarray)); 506 507 idx = a->j; 508 v = a->a; 509 if (usecprow) { 510 mbs = a->compressedrow.nrows; 511 ii = a->compressedrow.i; 512 ridx = a->compressedrow.rindex; 513 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 514 } else { 515 mbs = a->mbs; 516 ii = a->i; 517 z = zarray; 518 } 519 520 for (i = 0; i < mbs; i++) { 521 n = ii[1] - ii[0]; 522 ii++; 523 sum1 = 0.0; 524 sum2 = 0.0; 525 sum3 = 0.0; 526 sum4 = 0.0; 527 sum5 = 0.0; 528 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 529 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 530 for (j = 0; j < n; j++) { 531 xb = x + 5 * (*idx++); 532 x1 = xb[0]; 533 x2 = xb[1]; 534 x3 = xb[2]; 535 x4 = xb[3]; 536 x5 = xb[4]; 537 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 538 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 539 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 540 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 541 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 542 v += 25; 543 } 544 if (usecprow) z = zarray + 5 * ridx[i]; 545 z[0] = sum1; 546 z[1] = sum2; 547 z[2] = sum3; 548 z[3] = sum4; 549 z[4] = sum5; 550 if (!usecprow) z += 5; 551 } 552 PetscCall(VecRestoreArrayRead(xx, &x)); 553 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 554 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 555 PetscFunctionReturn(0); 556 } 557 558 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 559 { 560 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 561 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 562 const PetscScalar *x, *xb; 563 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 564 const MatScalar *v; 565 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 566 PetscBool usecprow = a->compressedrow.use; 567 568 PetscFunctionBegin; 569 PetscCall(VecGetArrayRead(xx, &x)); 570 PetscCall(VecGetArrayWrite(zz, &zarray)); 571 572 idx = a->j; 573 v = a->a; 574 if (usecprow) { 575 mbs = a->compressedrow.nrows; 576 ii = a->compressedrow.i; 577 ridx = a->compressedrow.rindex; 578 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 579 } else { 580 mbs = a->mbs; 581 ii = a->i; 582 z = zarray; 583 } 584 585 for (i = 0; i < mbs; i++) { 586 n = ii[1] - ii[0]; 587 ii++; 588 sum1 = 0.0; 589 sum2 = 0.0; 590 sum3 = 0.0; 591 sum4 = 0.0; 592 sum5 = 0.0; 593 sum6 = 0.0; 594 595 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 596 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 597 for (j = 0; j < n; j++) { 598 xb = x + 6 * (*idx++); 599 x1 = xb[0]; 600 x2 = xb[1]; 601 x3 = xb[2]; 602 x4 = xb[3]; 603 x5 = xb[4]; 604 x6 = xb[5]; 605 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 606 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 607 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 608 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 609 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 610 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 611 v += 36; 612 } 613 if (usecprow) z = zarray + 6 * ridx[i]; 614 z[0] = sum1; 615 z[1] = sum2; 616 z[2] = sum3; 617 z[3] = sum4; 618 z[4] = sum5; 619 z[5] = sum6; 620 if (!usecprow) z += 6; 621 } 622 623 PetscCall(VecRestoreArrayRead(xx, &x)); 624 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 625 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 626 PetscFunctionReturn(0); 627 } 628 629 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 630 { 631 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 632 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 633 const PetscScalar *x, *xb; 634 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 635 const MatScalar *v; 636 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 637 PetscBool usecprow = a->compressedrow.use; 638 639 PetscFunctionBegin; 640 PetscCall(VecGetArrayRead(xx, &x)); 641 PetscCall(VecGetArrayWrite(zz, &zarray)); 642 643 idx = a->j; 644 v = a->a; 645 if (usecprow) { 646 mbs = a->compressedrow.nrows; 647 ii = a->compressedrow.i; 648 ridx = a->compressedrow.rindex; 649 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 650 } else { 651 mbs = a->mbs; 652 ii = a->i; 653 z = zarray; 654 } 655 656 for (i = 0; i < mbs; i++) { 657 n = ii[1] - ii[0]; 658 ii++; 659 sum1 = 0.0; 660 sum2 = 0.0; 661 sum3 = 0.0; 662 sum4 = 0.0; 663 sum5 = 0.0; 664 sum6 = 0.0; 665 sum7 = 0.0; 666 667 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 668 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 669 for (j = 0; j < n; j++) { 670 xb = x + 7 * (*idx++); 671 x1 = xb[0]; 672 x2 = xb[1]; 673 x3 = xb[2]; 674 x4 = xb[3]; 675 x5 = xb[4]; 676 x6 = xb[5]; 677 x7 = xb[6]; 678 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 679 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 680 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 681 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 682 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 683 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 684 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 685 v += 49; 686 } 687 if (usecprow) z = zarray + 7 * ridx[i]; 688 z[0] = sum1; 689 z[1] = sum2; 690 z[2] = sum3; 691 z[3] = sum4; 692 z[4] = sum5; 693 z[5] = sum6; 694 z[6] = sum7; 695 if (!usecprow) z += 7; 696 } 697 698 PetscCall(VecRestoreArrayRead(xx, &x)); 699 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 700 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 701 PetscFunctionReturn(0); 702 } 703 704 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 705 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 706 { 707 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 708 PetscScalar *z = NULL, *work, *workt, *zarray; 709 const PetscScalar *x, *xb; 710 const MatScalar *v; 711 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 712 const PetscInt *idx, *ii, *ridx = NULL; 713 PetscInt k; 714 PetscBool usecprow = a->compressedrow.use; 715 716 __m256d a0, a1, a2, a3, a4, a5; 717 __m256d w0, w1, w2, w3; 718 __m256d z0, z1, z2; 719 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 720 721 PetscFunctionBegin; 722 PetscCall(VecGetArrayRead(xx, &x)); 723 PetscCall(VecGetArrayWrite(zz, &zarray)); 724 725 idx = a->j; 726 v = a->a; 727 if (usecprow) { 728 mbs = a->compressedrow.nrows; 729 ii = a->compressedrow.i; 730 ridx = a->compressedrow.rindex; 731 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 732 } else { 733 mbs = a->mbs; 734 ii = a->i; 735 z = zarray; 736 } 737 738 if (!a->mult_work) { 739 k = PetscMax(A->rmap->n, A->cmap->n); 740 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 741 } 742 743 work = a->mult_work; 744 for (i = 0; i < mbs; i++) { 745 n = ii[1] - ii[0]; 746 ii++; 747 workt = work; 748 for (j = 0; j < n; j++) { 749 xb = x + bs * (*idx++); 750 for (k = 0; k < bs; k++) workt[k] = xb[k]; 751 workt += bs; 752 } 753 if (usecprow) z = zarray + bs * ridx[i]; 754 755 z0 = _mm256_setzero_pd(); 756 z1 = _mm256_setzero_pd(); 757 z2 = _mm256_setzero_pd(); 758 759 for (j = 0; j < n; j++) { 760 /* first column of a */ 761 w0 = _mm256_set1_pd(work[j * 9]); 762 a0 = _mm256_loadu_pd(&v[j * 81]); 763 z0 = _mm256_fmadd_pd(a0, w0, z0); 764 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 765 z1 = _mm256_fmadd_pd(a1, w0, z1); 766 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 767 z2 = _mm256_fmadd_pd(a2, w0, z2); 768 769 /* second column of a */ 770 w1 = _mm256_set1_pd(work[j * 9 + 1]); 771 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 772 z0 = _mm256_fmadd_pd(a0, w1, z0); 773 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 774 z1 = _mm256_fmadd_pd(a1, w1, z1); 775 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 776 z2 = _mm256_fmadd_pd(a2, w1, z2); 777 778 /* third column of a */ 779 w2 = _mm256_set1_pd(work[j * 9 + 2]); 780 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 781 z0 = _mm256_fmadd_pd(a3, w2, z0); 782 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 783 z1 = _mm256_fmadd_pd(a4, w2, z1); 784 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 785 z2 = _mm256_fmadd_pd(a5, w2, z2); 786 787 /* fourth column of a */ 788 w3 = _mm256_set1_pd(work[j * 9 + 3]); 789 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 790 z0 = _mm256_fmadd_pd(a0, w3, z0); 791 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 792 z1 = _mm256_fmadd_pd(a1, w3, z1); 793 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 794 z2 = _mm256_fmadd_pd(a2, w3, z2); 795 796 /* fifth column of a */ 797 w0 = _mm256_set1_pd(work[j * 9 + 4]); 798 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 799 z0 = _mm256_fmadd_pd(a3, w0, z0); 800 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 801 z1 = _mm256_fmadd_pd(a4, w0, z1); 802 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 803 z2 = _mm256_fmadd_pd(a5, w0, z2); 804 805 /* sixth column of a */ 806 w1 = _mm256_set1_pd(work[j * 9 + 5]); 807 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 808 z0 = _mm256_fmadd_pd(a0, w1, z0); 809 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 810 z1 = _mm256_fmadd_pd(a1, w1, z1); 811 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 812 z2 = _mm256_fmadd_pd(a2, w1, z2); 813 814 /* seventh column of a */ 815 w2 = _mm256_set1_pd(work[j * 9 + 6]); 816 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 817 z0 = _mm256_fmadd_pd(a0, w2, z0); 818 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 819 z1 = _mm256_fmadd_pd(a1, w2, z1); 820 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 821 z2 = _mm256_fmadd_pd(a2, w2, z2); 822 823 /* eighth column of a */ 824 w3 = _mm256_set1_pd(work[j * 9 + 7]); 825 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 826 z0 = _mm256_fmadd_pd(a3, w3, z0); 827 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 828 z1 = _mm256_fmadd_pd(a4, w3, z1); 829 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 830 z2 = _mm256_fmadd_pd(a5, w3, z2); 831 832 /* ninth column of a */ 833 w0 = _mm256_set1_pd(work[j * 9 + 8]); 834 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 835 z0 = _mm256_fmadd_pd(a0, w0, z0); 836 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 837 z1 = _mm256_fmadd_pd(a1, w0, z1); 838 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 839 z2 = _mm256_fmadd_pd(a2, w0, z2); 840 } 841 842 _mm256_storeu_pd(&z[0], z0); 843 _mm256_storeu_pd(&z[4], z1); 844 _mm256_maskstore_pd(&z[8], mask1, z2); 845 846 v += n * bs2; 847 if (!usecprow) z += bs; 848 } 849 PetscCall(VecRestoreArrayRead(xx, &x)); 850 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 851 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 852 PetscFunctionReturn(0); 853 } 854 #endif 855 856 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 857 { 858 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 859 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 860 const PetscScalar *x, *xb; 861 PetscScalar *zarray, xv; 862 const MatScalar *v; 863 const PetscInt *ii, *ij = a->j, *idx; 864 PetscInt mbs, i, j, k, n, *ridx = NULL; 865 PetscBool usecprow = a->compressedrow.use; 866 867 PetscFunctionBegin; 868 PetscCall(VecGetArrayRead(xx, &x)); 869 PetscCall(VecGetArrayWrite(zz, &zarray)); 870 871 v = a->a; 872 if (usecprow) { 873 mbs = a->compressedrow.nrows; 874 ii = a->compressedrow.i; 875 ridx = a->compressedrow.rindex; 876 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 877 } else { 878 mbs = a->mbs; 879 ii = a->i; 880 z = zarray; 881 } 882 883 for (i = 0; i < mbs; i++) { 884 n = ii[i + 1] - ii[i]; 885 idx = ij + ii[i]; 886 sum1 = 0.0; 887 sum2 = 0.0; 888 sum3 = 0.0; 889 sum4 = 0.0; 890 sum5 = 0.0; 891 sum6 = 0.0; 892 sum7 = 0.0; 893 sum8 = 0.0; 894 sum9 = 0.0; 895 sum10 = 0.0; 896 sum11 = 0.0; 897 898 for (j = 0; j < n; j++) { 899 xb = x + 11 * (idx[j]); 900 901 for (k = 0; k < 11; k++) { 902 xv = xb[k]; 903 sum1 += v[0] * xv; 904 sum2 += v[1] * xv; 905 sum3 += v[2] * xv; 906 sum4 += v[3] * xv; 907 sum5 += v[4] * xv; 908 sum6 += v[5] * xv; 909 sum7 += v[6] * xv; 910 sum8 += v[7] * xv; 911 sum9 += v[8] * xv; 912 sum10 += v[9] * xv; 913 sum11 += v[10] * xv; 914 v += 11; 915 } 916 } 917 if (usecprow) z = zarray + 11 * ridx[i]; 918 z[0] = sum1; 919 z[1] = sum2; 920 z[2] = sum3; 921 z[3] = sum4; 922 z[4] = sum5; 923 z[5] = sum6; 924 z[6] = sum7; 925 z[7] = sum8; 926 z[8] = sum9; 927 z[9] = sum10; 928 z[10] = sum11; 929 930 if (!usecprow) z += 11; 931 } 932 933 PetscCall(VecRestoreArrayRead(xx, &x)); 934 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 935 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 936 PetscFunctionReturn(0); 937 } 938 939 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 940 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 941 { 942 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 943 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 944 const PetscScalar *x, *xb; 945 PetscScalar *zarray, xv; 946 const MatScalar *v; 947 const PetscInt *ii, *ij = a->j, *idx; 948 PetscInt mbs, i, j, k, n, *ridx = NULL; 949 PetscBool usecprow = a->compressedrow.use; 950 951 PetscFunctionBegin; 952 PetscCall(VecGetArrayRead(xx, &x)); 953 PetscCall(VecGetArrayWrite(zz, &zarray)); 954 955 v = a->a; 956 if (usecprow) { 957 mbs = a->compressedrow.nrows; 958 ii = a->compressedrow.i; 959 ridx = a->compressedrow.rindex; 960 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 961 } else { 962 mbs = a->mbs; 963 ii = a->i; 964 z = zarray; 965 } 966 967 for (i = 0; i < mbs; i++) { 968 n = ii[i + 1] - ii[i]; 969 idx = ij + ii[i]; 970 sum1 = 0.0; 971 sum2 = 0.0; 972 sum3 = 0.0; 973 sum4 = 0.0; 974 sum5 = 0.0; 975 sum6 = 0.0; 976 sum7 = 0.0; 977 sum8 = 0.0; 978 sum9 = 0.0; 979 sum10 = 0.0; 980 sum11 = 0.0; 981 sum12 = 0.0; 982 983 for (j = 0; j < n; j++) { 984 xb = x + 12 * (idx[j]); 985 986 for (k = 0; k < 12; k++) { 987 xv = xb[k]; 988 sum1 += v[0] * xv; 989 sum2 += v[1] * xv; 990 sum3 += v[2] * xv; 991 sum4 += v[3] * xv; 992 sum5 += v[4] * xv; 993 sum6 += v[5] * xv; 994 sum7 += v[6] * xv; 995 sum8 += v[7] * xv; 996 sum9 += v[8] * xv; 997 sum10 += v[9] * xv; 998 sum11 += v[10] * xv; 999 sum12 += v[11] * xv; 1000 v += 12; 1001 } 1002 } 1003 if (usecprow) z = zarray + 12 * ridx[i]; 1004 z[0] = sum1; 1005 z[1] = sum2; 1006 z[2] = sum3; 1007 z[3] = sum4; 1008 z[4] = sum5; 1009 z[5] = sum6; 1010 z[6] = sum7; 1011 z[7] = sum8; 1012 z[8] = sum9; 1013 z[9] = sum10; 1014 z[10] = sum11; 1015 z[11] = sum12; 1016 if (!usecprow) z += 12; 1017 } 1018 PetscCall(VecRestoreArrayRead(xx, &x)); 1019 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1020 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1021 PetscFunctionReturn(0); 1022 } 1023 1024 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 1025 { 1026 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1027 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1028 const PetscScalar *x, *xb; 1029 PetscScalar *zarray, *yarray, xv; 1030 const MatScalar *v; 1031 const PetscInt *ii, *ij = a->j, *idx; 1032 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 1033 PetscBool usecprow = a->compressedrow.use; 1034 1035 PetscFunctionBegin; 1036 PetscCall(VecGetArrayRead(xx, &x)); 1037 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1038 1039 v = a->a; 1040 if (usecprow) { 1041 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1042 mbs = a->compressedrow.nrows; 1043 ii = a->compressedrow.i; 1044 ridx = a->compressedrow.rindex; 1045 } else { 1046 ii = a->i; 1047 y = yarray; 1048 z = zarray; 1049 } 1050 1051 for (i = 0; i < mbs; i++) { 1052 n = ii[i + 1] - ii[i]; 1053 idx = ij + ii[i]; 1054 1055 if (usecprow) { 1056 y = yarray + 12 * ridx[i]; 1057 z = zarray + 12 * ridx[i]; 1058 } 1059 sum1 = y[0]; 1060 sum2 = y[1]; 1061 sum3 = y[2]; 1062 sum4 = y[3]; 1063 sum5 = y[4]; 1064 sum6 = y[5]; 1065 sum7 = y[6]; 1066 sum8 = y[7]; 1067 sum9 = y[8]; 1068 sum10 = y[9]; 1069 sum11 = y[10]; 1070 sum12 = y[11]; 1071 1072 for (j = 0; j < n; j++) { 1073 xb = x + 12 * (idx[j]); 1074 1075 for (k = 0; k < 12; k++) { 1076 xv = xb[k]; 1077 sum1 += v[0] * xv; 1078 sum2 += v[1] * xv; 1079 sum3 += v[2] * xv; 1080 sum4 += v[3] * xv; 1081 sum5 += v[4] * xv; 1082 sum6 += v[5] * xv; 1083 sum7 += v[6] * xv; 1084 sum8 += v[7] * xv; 1085 sum9 += v[8] * xv; 1086 sum10 += v[9] * xv; 1087 sum11 += v[10] * xv; 1088 sum12 += v[11] * xv; 1089 v += 12; 1090 } 1091 } 1092 1093 z[0] = sum1; 1094 z[1] = sum2; 1095 z[2] = sum3; 1096 z[3] = sum4; 1097 z[4] = sum5; 1098 z[5] = sum6; 1099 z[6] = sum7; 1100 z[7] = sum8; 1101 z[8] = sum9; 1102 z[9] = sum10; 1103 z[10] = sum11; 1104 z[11] = sum12; 1105 if (!usecprow) { 1106 y += 12; 1107 z += 12; 1108 } 1109 } 1110 PetscCall(VecRestoreArrayRead(xx, &x)); 1111 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1112 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1113 PetscFunctionReturn(0); 1114 } 1115 1116 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1117 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1118 { 1119 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1120 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1121 const PetscScalar *x, *xb; 1122 PetscScalar x1, x2, x3, x4, *zarray; 1123 const MatScalar *v; 1124 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1125 PetscInt mbs, i, j, n; 1126 PetscBool usecprow = a->compressedrow.use; 1127 1128 PetscFunctionBegin; 1129 PetscCall(VecGetArrayRead(xx, &x)); 1130 PetscCall(VecGetArrayWrite(zz, &zarray)); 1131 1132 v = a->a; 1133 if (usecprow) { 1134 mbs = a->compressedrow.nrows; 1135 ii = a->compressedrow.i; 1136 ridx = a->compressedrow.rindex; 1137 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1138 } else { 1139 mbs = a->mbs; 1140 ii = a->i; 1141 z = zarray; 1142 } 1143 1144 for (i = 0; i < mbs; i++) { 1145 n = ii[i + 1] - ii[i]; 1146 idx = ij + ii[i]; 1147 1148 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1149 for (j = 0; j < n; j++) { 1150 xb = x + 12 * (idx[j]); 1151 x1 = xb[0]; 1152 x2 = xb[1]; 1153 x3 = xb[2]; 1154 x4 = xb[3]; 1155 1156 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1157 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1158 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1159 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1160 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1161 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1162 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1163 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1164 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1165 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1166 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1167 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1168 v += 48; 1169 1170 x1 = xb[4]; 1171 x2 = xb[5]; 1172 x3 = xb[6]; 1173 x4 = xb[7]; 1174 1175 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1176 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1177 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1178 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1179 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1180 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1181 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1182 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1183 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1184 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1185 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1186 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1187 v += 48; 1188 1189 x1 = xb[8]; 1190 x2 = xb[9]; 1191 x3 = xb[10]; 1192 x4 = xb[11]; 1193 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1194 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1195 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1196 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1197 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1198 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1199 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1200 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1201 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1202 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1203 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1204 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1205 v += 48; 1206 } 1207 if (usecprow) z = zarray + 12 * ridx[i]; 1208 z[0] = sum1; 1209 z[1] = sum2; 1210 z[2] = sum3; 1211 z[3] = sum4; 1212 z[4] = sum5; 1213 z[5] = sum6; 1214 z[6] = sum7; 1215 z[7] = sum8; 1216 z[8] = sum9; 1217 z[9] = sum10; 1218 z[10] = sum11; 1219 z[11] = sum12; 1220 if (!usecprow) z += 12; 1221 } 1222 PetscCall(VecRestoreArrayRead(xx, &x)); 1223 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1224 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1225 PetscFunctionReturn(0); 1226 } 1227 1228 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1229 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1230 { 1231 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1232 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1233 const PetscScalar *x, *xb; 1234 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1235 const MatScalar *v; 1236 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1237 PetscInt mbs = a->mbs, i, j, n; 1238 PetscBool usecprow = a->compressedrow.use; 1239 1240 PetscFunctionBegin; 1241 PetscCall(VecGetArrayRead(xx, &x)); 1242 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1243 1244 v = a->a; 1245 if (usecprow) { 1246 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1247 mbs = a->compressedrow.nrows; 1248 ii = a->compressedrow.i; 1249 ridx = a->compressedrow.rindex; 1250 } else { 1251 ii = a->i; 1252 y = yarray; 1253 z = zarray; 1254 } 1255 1256 for (i = 0; i < mbs; i++) { 1257 n = ii[i + 1] - ii[i]; 1258 idx = ij + ii[i]; 1259 1260 if (usecprow) { 1261 y = yarray + 12 * ridx[i]; 1262 z = zarray + 12 * ridx[i]; 1263 } 1264 sum1 = y[0]; 1265 sum2 = y[1]; 1266 sum3 = y[2]; 1267 sum4 = y[3]; 1268 sum5 = y[4]; 1269 sum6 = y[5]; 1270 sum7 = y[6]; 1271 sum8 = y[7]; 1272 sum9 = y[8]; 1273 sum10 = y[9]; 1274 sum11 = y[10]; 1275 sum12 = y[11]; 1276 1277 for (j = 0; j < n; j++) { 1278 xb = x + 12 * (idx[j]); 1279 x1 = xb[0]; 1280 x2 = xb[1]; 1281 x3 = xb[2]; 1282 x4 = xb[3]; 1283 1284 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1285 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1286 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1287 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1288 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1289 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1290 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1291 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1292 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1293 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1294 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1295 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1296 v += 48; 1297 1298 x1 = xb[4]; 1299 x2 = xb[5]; 1300 x3 = xb[6]; 1301 x4 = xb[7]; 1302 1303 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1304 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1305 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1306 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1307 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1308 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1309 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1310 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1311 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1312 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1313 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1314 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1315 v += 48; 1316 1317 x1 = xb[8]; 1318 x2 = xb[9]; 1319 x3 = xb[10]; 1320 x4 = xb[11]; 1321 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1322 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1323 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1324 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1325 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1326 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1327 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1328 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1329 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1330 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1331 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1332 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1333 v += 48; 1334 } 1335 z[0] = sum1; 1336 z[1] = sum2; 1337 z[2] = sum3; 1338 z[3] = sum4; 1339 z[4] = sum5; 1340 z[5] = sum6; 1341 z[6] = sum7; 1342 z[7] = sum8; 1343 z[8] = sum9; 1344 z[9] = sum10; 1345 z[10] = sum11; 1346 z[11] = sum12; 1347 if (!usecprow) { 1348 y += 12; 1349 z += 12; 1350 } 1351 } 1352 PetscCall(VecRestoreArrayRead(xx, &x)); 1353 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1354 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1355 PetscFunctionReturn(0); 1356 } 1357 1358 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1359 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1360 { 1361 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1362 PetscScalar *z = NULL, *zarray; 1363 const PetscScalar *x, *work; 1364 const MatScalar *v = a->a; 1365 PetscInt mbs, i, j, n; 1366 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1367 PetscBool usecprow = a->compressedrow.use; 1368 const PetscInt bs = 12, bs2 = 144; 1369 1370 __m256d a0, a1, a2, a3, a4, a5; 1371 __m256d w0, w1, w2, w3; 1372 __m256d z0, z1, z2; 1373 1374 PetscFunctionBegin; 1375 PetscCall(VecGetArrayRead(xx, &x)); 1376 PetscCall(VecGetArrayWrite(zz, &zarray)); 1377 1378 if (usecprow) { 1379 mbs = a->compressedrow.nrows; 1380 ii = a->compressedrow.i; 1381 ridx = a->compressedrow.rindex; 1382 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1383 } else { 1384 mbs = a->mbs; 1385 ii = a->i; 1386 z = zarray; 1387 } 1388 1389 for (i = 0; i < mbs; i++) { 1390 z0 = _mm256_setzero_pd(); 1391 z1 = _mm256_setzero_pd(); 1392 z2 = _mm256_setzero_pd(); 1393 1394 n = ii[1] - ii[0]; 1395 ii++; 1396 for (j = 0; j < n; j++) { 1397 work = x + bs * (*idx++); 1398 1399 /* first column of a */ 1400 w0 = _mm256_set1_pd(work[0]); 1401 a0 = _mm256_loadu_pd(v + 0); 1402 z0 = _mm256_fmadd_pd(a0, w0, z0); 1403 a1 = _mm256_loadu_pd(v + 4); 1404 z1 = _mm256_fmadd_pd(a1, w0, z1); 1405 a2 = _mm256_loadu_pd(v + 8); 1406 z2 = _mm256_fmadd_pd(a2, w0, z2); 1407 1408 /* second column of a */ 1409 w1 = _mm256_set1_pd(work[1]); 1410 a3 = _mm256_loadu_pd(v + 12); 1411 z0 = _mm256_fmadd_pd(a3, w1, z0); 1412 a4 = _mm256_loadu_pd(v + 16); 1413 z1 = _mm256_fmadd_pd(a4, w1, z1); 1414 a5 = _mm256_loadu_pd(v + 20); 1415 z2 = _mm256_fmadd_pd(a5, w1, z2); 1416 1417 /* third column of a */ 1418 w2 = _mm256_set1_pd(work[2]); 1419 a0 = _mm256_loadu_pd(v + 24); 1420 z0 = _mm256_fmadd_pd(a0, w2, z0); 1421 a1 = _mm256_loadu_pd(v + 28); 1422 z1 = _mm256_fmadd_pd(a1, w2, z1); 1423 a2 = _mm256_loadu_pd(v + 32); 1424 z2 = _mm256_fmadd_pd(a2, w2, z2); 1425 1426 /* fourth column of a */ 1427 w3 = _mm256_set1_pd(work[3]); 1428 a3 = _mm256_loadu_pd(v + 36); 1429 z0 = _mm256_fmadd_pd(a3, w3, z0); 1430 a4 = _mm256_loadu_pd(v + 40); 1431 z1 = _mm256_fmadd_pd(a4, w3, z1); 1432 a5 = _mm256_loadu_pd(v + 44); 1433 z2 = _mm256_fmadd_pd(a5, w3, z2); 1434 1435 /* fifth column of a */ 1436 w0 = _mm256_set1_pd(work[4]); 1437 a0 = _mm256_loadu_pd(v + 48); 1438 z0 = _mm256_fmadd_pd(a0, w0, z0); 1439 a1 = _mm256_loadu_pd(v + 52); 1440 z1 = _mm256_fmadd_pd(a1, w0, z1); 1441 a2 = _mm256_loadu_pd(v + 56); 1442 z2 = _mm256_fmadd_pd(a2, w0, z2); 1443 1444 /* sixth column of a */ 1445 w1 = _mm256_set1_pd(work[5]); 1446 a3 = _mm256_loadu_pd(v + 60); 1447 z0 = _mm256_fmadd_pd(a3, w1, z0); 1448 a4 = _mm256_loadu_pd(v + 64); 1449 z1 = _mm256_fmadd_pd(a4, w1, z1); 1450 a5 = _mm256_loadu_pd(v + 68); 1451 z2 = _mm256_fmadd_pd(a5, w1, z2); 1452 1453 /* seventh column of a */ 1454 w2 = _mm256_set1_pd(work[6]); 1455 a0 = _mm256_loadu_pd(v + 72); 1456 z0 = _mm256_fmadd_pd(a0, w2, z0); 1457 a1 = _mm256_loadu_pd(v + 76); 1458 z1 = _mm256_fmadd_pd(a1, w2, z1); 1459 a2 = _mm256_loadu_pd(v + 80); 1460 z2 = _mm256_fmadd_pd(a2, w2, z2); 1461 1462 /* eighth column of a */ 1463 w3 = _mm256_set1_pd(work[7]); 1464 a3 = _mm256_loadu_pd(v + 84); 1465 z0 = _mm256_fmadd_pd(a3, w3, z0); 1466 a4 = _mm256_loadu_pd(v + 88); 1467 z1 = _mm256_fmadd_pd(a4, w3, z1); 1468 a5 = _mm256_loadu_pd(v + 92); 1469 z2 = _mm256_fmadd_pd(a5, w3, z2); 1470 1471 /* ninth column of a */ 1472 w0 = _mm256_set1_pd(work[8]); 1473 a0 = _mm256_loadu_pd(v + 96); 1474 z0 = _mm256_fmadd_pd(a0, w0, z0); 1475 a1 = _mm256_loadu_pd(v + 100); 1476 z1 = _mm256_fmadd_pd(a1, w0, z1); 1477 a2 = _mm256_loadu_pd(v + 104); 1478 z2 = _mm256_fmadd_pd(a2, w0, z2); 1479 1480 /* tenth column of a */ 1481 w1 = _mm256_set1_pd(work[9]); 1482 a3 = _mm256_loadu_pd(v + 108); 1483 z0 = _mm256_fmadd_pd(a3, w1, z0); 1484 a4 = _mm256_loadu_pd(v + 112); 1485 z1 = _mm256_fmadd_pd(a4, w1, z1); 1486 a5 = _mm256_loadu_pd(v + 116); 1487 z2 = _mm256_fmadd_pd(a5, w1, z2); 1488 1489 /* eleventh column of a */ 1490 w2 = _mm256_set1_pd(work[10]); 1491 a0 = _mm256_loadu_pd(v + 120); 1492 z0 = _mm256_fmadd_pd(a0, w2, z0); 1493 a1 = _mm256_loadu_pd(v + 124); 1494 z1 = _mm256_fmadd_pd(a1, w2, z1); 1495 a2 = _mm256_loadu_pd(v + 128); 1496 z2 = _mm256_fmadd_pd(a2, w2, z2); 1497 1498 /* twelveth column of a */ 1499 w3 = _mm256_set1_pd(work[11]); 1500 a3 = _mm256_loadu_pd(v + 132); 1501 z0 = _mm256_fmadd_pd(a3, w3, z0); 1502 a4 = _mm256_loadu_pd(v + 136); 1503 z1 = _mm256_fmadd_pd(a4, w3, z1); 1504 a5 = _mm256_loadu_pd(v + 140); 1505 z2 = _mm256_fmadd_pd(a5, w3, z2); 1506 1507 v += bs2; 1508 } 1509 if (usecprow) z = zarray + bs * ridx[i]; 1510 _mm256_storeu_pd(&z[0], z0); 1511 _mm256_storeu_pd(&z[4], z1); 1512 _mm256_storeu_pd(&z[8], z2); 1513 if (!usecprow) z += bs; 1514 } 1515 PetscCall(VecRestoreArrayRead(xx, &x)); 1516 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1517 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1518 PetscFunctionReturn(0); 1519 } 1520 #endif 1521 1522 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1523 /* Default MatMult for block size 15 */ 1524 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1525 { 1526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1527 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1528 const PetscScalar *x, *xb; 1529 PetscScalar *zarray, xv; 1530 const MatScalar *v; 1531 const PetscInt *ii, *ij = a->j, *idx; 1532 PetscInt mbs, i, j, k, n, *ridx = NULL; 1533 PetscBool usecprow = a->compressedrow.use; 1534 1535 PetscFunctionBegin; 1536 PetscCall(VecGetArrayRead(xx, &x)); 1537 PetscCall(VecGetArrayWrite(zz, &zarray)); 1538 1539 v = a->a; 1540 if (usecprow) { 1541 mbs = a->compressedrow.nrows; 1542 ii = a->compressedrow.i; 1543 ridx = a->compressedrow.rindex; 1544 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1545 } else { 1546 mbs = a->mbs; 1547 ii = a->i; 1548 z = zarray; 1549 } 1550 1551 for (i = 0; i < mbs; i++) { 1552 n = ii[i + 1] - ii[i]; 1553 idx = ij + ii[i]; 1554 sum1 = 0.0; 1555 sum2 = 0.0; 1556 sum3 = 0.0; 1557 sum4 = 0.0; 1558 sum5 = 0.0; 1559 sum6 = 0.0; 1560 sum7 = 0.0; 1561 sum8 = 0.0; 1562 sum9 = 0.0; 1563 sum10 = 0.0; 1564 sum11 = 0.0; 1565 sum12 = 0.0; 1566 sum13 = 0.0; 1567 sum14 = 0.0; 1568 sum15 = 0.0; 1569 1570 for (j = 0; j < n; j++) { 1571 xb = x + 15 * (idx[j]); 1572 1573 for (k = 0; k < 15; k++) { 1574 xv = xb[k]; 1575 sum1 += v[0] * xv; 1576 sum2 += v[1] * xv; 1577 sum3 += v[2] * xv; 1578 sum4 += v[3] * xv; 1579 sum5 += v[4] * xv; 1580 sum6 += v[5] * xv; 1581 sum7 += v[6] * xv; 1582 sum8 += v[7] * xv; 1583 sum9 += v[8] * xv; 1584 sum10 += v[9] * xv; 1585 sum11 += v[10] * xv; 1586 sum12 += v[11] * xv; 1587 sum13 += v[12] * xv; 1588 sum14 += v[13] * xv; 1589 sum15 += v[14] * xv; 1590 v += 15; 1591 } 1592 } 1593 if (usecprow) z = zarray + 15 * ridx[i]; 1594 z[0] = sum1; 1595 z[1] = sum2; 1596 z[2] = sum3; 1597 z[3] = sum4; 1598 z[4] = sum5; 1599 z[5] = sum6; 1600 z[6] = sum7; 1601 z[7] = sum8; 1602 z[8] = sum9; 1603 z[9] = sum10; 1604 z[10] = sum11; 1605 z[11] = sum12; 1606 z[12] = sum13; 1607 z[13] = sum14; 1608 z[14] = sum15; 1609 1610 if (!usecprow) z += 15; 1611 } 1612 1613 PetscCall(VecRestoreArrayRead(xx, &x)); 1614 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1615 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1616 PetscFunctionReturn(0); 1617 } 1618 1619 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1620 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1621 { 1622 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1623 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1624 const PetscScalar *x, *xb; 1625 PetscScalar x1, x2, x3, x4, *zarray; 1626 const MatScalar *v; 1627 const PetscInt *ii, *ij = a->j, *idx; 1628 PetscInt mbs, i, j, n, *ridx = NULL; 1629 PetscBool usecprow = a->compressedrow.use; 1630 1631 PetscFunctionBegin; 1632 PetscCall(VecGetArrayRead(xx, &x)); 1633 PetscCall(VecGetArrayWrite(zz, &zarray)); 1634 1635 v = a->a; 1636 if (usecprow) { 1637 mbs = a->compressedrow.nrows; 1638 ii = a->compressedrow.i; 1639 ridx = a->compressedrow.rindex; 1640 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1641 } else { 1642 mbs = a->mbs; 1643 ii = a->i; 1644 z = zarray; 1645 } 1646 1647 for (i = 0; i < mbs; i++) { 1648 n = ii[i + 1] - ii[i]; 1649 idx = ij + ii[i]; 1650 sum1 = 0.0; 1651 sum2 = 0.0; 1652 sum3 = 0.0; 1653 sum4 = 0.0; 1654 sum5 = 0.0; 1655 sum6 = 0.0; 1656 sum7 = 0.0; 1657 sum8 = 0.0; 1658 sum9 = 0.0; 1659 sum10 = 0.0; 1660 sum11 = 0.0; 1661 sum12 = 0.0; 1662 sum13 = 0.0; 1663 sum14 = 0.0; 1664 sum15 = 0.0; 1665 1666 for (j = 0; j < n; j++) { 1667 xb = x + 15 * (idx[j]); 1668 x1 = xb[0]; 1669 x2 = xb[1]; 1670 x3 = xb[2]; 1671 x4 = xb[3]; 1672 1673 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1674 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1675 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1676 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1677 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1678 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1679 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1680 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1681 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1682 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1683 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1684 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1685 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1686 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1687 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1688 1689 v += 60; 1690 1691 x1 = xb[4]; 1692 x2 = xb[5]; 1693 x3 = xb[6]; 1694 x4 = xb[7]; 1695 1696 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1697 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1698 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1699 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1700 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1701 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1702 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1703 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1704 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1705 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1706 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1707 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1708 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1709 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1710 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1711 v += 60; 1712 1713 x1 = xb[8]; 1714 x2 = xb[9]; 1715 x3 = xb[10]; 1716 x4 = xb[11]; 1717 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1718 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1719 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1720 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1721 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1722 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1723 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1724 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1725 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1726 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1727 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1728 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1729 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1730 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1731 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1732 v += 60; 1733 1734 x1 = xb[12]; 1735 x2 = xb[13]; 1736 x3 = xb[14]; 1737 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1738 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1739 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1740 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1741 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1742 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1743 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1744 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1745 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1746 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1747 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1748 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1749 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1750 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1751 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1752 v += 45; 1753 } 1754 if (usecprow) z = zarray + 15 * ridx[i]; 1755 z[0] = sum1; 1756 z[1] = sum2; 1757 z[2] = sum3; 1758 z[3] = sum4; 1759 z[4] = sum5; 1760 z[5] = sum6; 1761 z[6] = sum7; 1762 z[7] = sum8; 1763 z[8] = sum9; 1764 z[9] = sum10; 1765 z[10] = sum11; 1766 z[11] = sum12; 1767 z[12] = sum13; 1768 z[13] = sum14; 1769 z[14] = sum15; 1770 1771 if (!usecprow) z += 15; 1772 } 1773 1774 PetscCall(VecRestoreArrayRead(xx, &x)); 1775 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1776 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1777 PetscFunctionReturn(0); 1778 } 1779 1780 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1781 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1782 { 1783 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1785 const PetscScalar *x, *xb; 1786 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1787 const MatScalar *v; 1788 const PetscInt *ii, *ij = a->j, *idx; 1789 PetscInt mbs, i, j, n, *ridx = NULL; 1790 PetscBool usecprow = a->compressedrow.use; 1791 1792 PetscFunctionBegin; 1793 PetscCall(VecGetArrayRead(xx, &x)); 1794 PetscCall(VecGetArrayWrite(zz, &zarray)); 1795 1796 v = a->a; 1797 if (usecprow) { 1798 mbs = a->compressedrow.nrows; 1799 ii = a->compressedrow.i; 1800 ridx = a->compressedrow.rindex; 1801 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1802 } else { 1803 mbs = a->mbs; 1804 ii = a->i; 1805 z = zarray; 1806 } 1807 1808 for (i = 0; i < mbs; i++) { 1809 n = ii[i + 1] - ii[i]; 1810 idx = ij + ii[i]; 1811 sum1 = 0.0; 1812 sum2 = 0.0; 1813 sum3 = 0.0; 1814 sum4 = 0.0; 1815 sum5 = 0.0; 1816 sum6 = 0.0; 1817 sum7 = 0.0; 1818 sum8 = 0.0; 1819 sum9 = 0.0; 1820 sum10 = 0.0; 1821 sum11 = 0.0; 1822 sum12 = 0.0; 1823 sum13 = 0.0; 1824 sum14 = 0.0; 1825 sum15 = 0.0; 1826 1827 for (j = 0; j < n; j++) { 1828 xb = x + 15 * (idx[j]); 1829 x1 = xb[0]; 1830 x2 = xb[1]; 1831 x3 = xb[2]; 1832 x4 = xb[3]; 1833 x5 = xb[4]; 1834 x6 = xb[5]; 1835 x7 = xb[6]; 1836 x8 = xb[7]; 1837 1838 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1839 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1840 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1841 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1842 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1843 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1844 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1845 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1846 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1847 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1848 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1849 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1850 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1851 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1852 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1853 v += 120; 1854 1855 x1 = xb[8]; 1856 x2 = xb[9]; 1857 x3 = xb[10]; 1858 x4 = xb[11]; 1859 x5 = xb[12]; 1860 x6 = xb[13]; 1861 x7 = xb[14]; 1862 1863 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1864 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1865 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1866 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1867 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1868 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1869 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1870 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1871 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1872 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1873 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1874 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1875 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1876 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1877 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1878 v += 105; 1879 } 1880 if (usecprow) z = zarray + 15 * ridx[i]; 1881 z[0] = sum1; 1882 z[1] = sum2; 1883 z[2] = sum3; 1884 z[3] = sum4; 1885 z[4] = sum5; 1886 z[5] = sum6; 1887 z[6] = sum7; 1888 z[7] = sum8; 1889 z[8] = sum9; 1890 z[9] = sum10; 1891 z[10] = sum11; 1892 z[11] = sum12; 1893 z[12] = sum13; 1894 z[13] = sum14; 1895 z[14] = sum15; 1896 1897 if (!usecprow) z += 15; 1898 } 1899 1900 PetscCall(VecRestoreArrayRead(xx, &x)); 1901 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1902 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1903 PetscFunctionReturn(0); 1904 } 1905 1906 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1907 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) 1908 { 1909 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1910 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1911 const PetscScalar *x, *xb; 1912 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1913 const MatScalar *v; 1914 const PetscInt *ii, *ij = a->j, *idx; 1915 PetscInt mbs, i, j, n, *ridx = NULL; 1916 PetscBool usecprow = a->compressedrow.use; 1917 1918 PetscFunctionBegin; 1919 PetscCall(VecGetArrayRead(xx, &x)); 1920 PetscCall(VecGetArrayWrite(zz, &zarray)); 1921 1922 v = a->a; 1923 if (usecprow) { 1924 mbs = a->compressedrow.nrows; 1925 ii = a->compressedrow.i; 1926 ridx = a->compressedrow.rindex; 1927 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1928 } else { 1929 mbs = a->mbs; 1930 ii = a->i; 1931 z = zarray; 1932 } 1933 1934 for (i = 0; i < mbs; i++) { 1935 n = ii[i + 1] - ii[i]; 1936 idx = ij + ii[i]; 1937 sum1 = 0.0; 1938 sum2 = 0.0; 1939 sum3 = 0.0; 1940 sum4 = 0.0; 1941 sum5 = 0.0; 1942 sum6 = 0.0; 1943 sum7 = 0.0; 1944 sum8 = 0.0; 1945 sum9 = 0.0; 1946 sum10 = 0.0; 1947 sum11 = 0.0; 1948 sum12 = 0.0; 1949 sum13 = 0.0; 1950 sum14 = 0.0; 1951 sum15 = 0.0; 1952 1953 for (j = 0; j < n; j++) { 1954 xb = x + 15 * (idx[j]); 1955 x1 = xb[0]; 1956 x2 = xb[1]; 1957 x3 = xb[2]; 1958 x4 = xb[3]; 1959 x5 = xb[4]; 1960 x6 = xb[5]; 1961 x7 = xb[6]; 1962 x8 = xb[7]; 1963 x9 = xb[8]; 1964 x10 = xb[9]; 1965 x11 = xb[10]; 1966 x12 = xb[11]; 1967 x13 = xb[12]; 1968 x14 = xb[13]; 1969 x15 = xb[14]; 1970 1971 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1972 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1973 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1974 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1975 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1976 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1977 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1978 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1979 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1980 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1981 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1982 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1983 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1984 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1985 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1986 v += 225; 1987 } 1988 if (usecprow) z = zarray + 15 * ridx[i]; 1989 z[0] = sum1; 1990 z[1] = sum2; 1991 z[2] = sum3; 1992 z[3] = sum4; 1993 z[4] = sum5; 1994 z[5] = sum6; 1995 z[6] = sum7; 1996 z[7] = sum8; 1997 z[8] = sum9; 1998 z[9] = sum10; 1999 z[10] = sum11; 2000 z[11] = sum12; 2001 z[12] = sum13; 2002 z[13] = sum14; 2003 z[14] = sum15; 2004 2005 if (!usecprow) z += 15; 2006 } 2007 2008 PetscCall(VecRestoreArrayRead(xx, &x)); 2009 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2010 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 2011 PetscFunctionReturn(0); 2012 } 2013 2014 /* 2015 This will not work with MatScalar == float because it calls the BLAS 2016 */ 2017 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 2018 { 2019 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2020 PetscScalar *z = NULL, *work, *workt, *zarray; 2021 const PetscScalar *x, *xb; 2022 const MatScalar *v; 2023 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2024 const PetscInt *idx, *ii, *ridx = NULL; 2025 PetscInt ncols, k; 2026 PetscBool usecprow = a->compressedrow.use; 2027 2028 PetscFunctionBegin; 2029 PetscCall(VecGetArrayRead(xx, &x)); 2030 PetscCall(VecGetArrayWrite(zz, &zarray)); 2031 2032 idx = a->j; 2033 v = a->a; 2034 if (usecprow) { 2035 mbs = a->compressedrow.nrows; 2036 ii = a->compressedrow.i; 2037 ridx = a->compressedrow.rindex; 2038 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2039 } else { 2040 mbs = a->mbs; 2041 ii = a->i; 2042 z = zarray; 2043 } 2044 2045 if (!a->mult_work) { 2046 k = PetscMax(A->rmap->n, A->cmap->n); 2047 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2048 } 2049 work = a->mult_work; 2050 for (i = 0; i < mbs; i++) { 2051 n = ii[1] - ii[0]; 2052 ii++; 2053 ncols = n * bs; 2054 workt = work; 2055 for (j = 0; j < n; j++) { 2056 xb = x + bs * (*idx++); 2057 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2058 workt += bs; 2059 } 2060 if (usecprow) z = zarray + bs * ridx[i]; 2061 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2062 v += n * bs2; 2063 if (!usecprow) z += bs; 2064 } 2065 PetscCall(VecRestoreArrayRead(xx, &x)); 2066 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2067 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2068 PetscFunctionReturn(0); 2069 } 2070 2071 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2072 { 2073 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2074 const PetscScalar *x; 2075 PetscScalar *y, *z, sum; 2076 const MatScalar *v; 2077 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2078 const PetscInt *idx, *ii; 2079 PetscBool usecprow = a->compressedrow.use; 2080 2081 PetscFunctionBegin; 2082 PetscCall(VecGetArrayRead(xx, &x)); 2083 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2084 2085 idx = a->j; 2086 v = a->a; 2087 if (usecprow) { 2088 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 2089 mbs = a->compressedrow.nrows; 2090 ii = a->compressedrow.i; 2091 ridx = a->compressedrow.rindex; 2092 } else { 2093 ii = a->i; 2094 } 2095 2096 for (i = 0; i < mbs; i++) { 2097 n = ii[1] - ii[0]; 2098 ii++; 2099 if (!usecprow) { 2100 sum = y[i]; 2101 } else { 2102 sum = y[ridx[i]]; 2103 } 2104 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2105 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2106 PetscSparseDensePlusDot(sum, x, v, idx, n); 2107 v += n; 2108 idx += n; 2109 if (usecprow) { 2110 z[ridx[i]] = sum; 2111 } else { 2112 z[i] = sum; 2113 } 2114 } 2115 PetscCall(VecRestoreArrayRead(xx, &x)); 2116 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2117 PetscCall(PetscLogFlops(2.0 * a->nz)); 2118 PetscFunctionReturn(0); 2119 } 2120 2121 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2122 { 2123 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2124 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2125 const PetscScalar *x, *xb; 2126 PetscScalar x1, x2, *yarray, *zarray; 2127 const MatScalar *v; 2128 PetscInt mbs = a->mbs, i, n, j; 2129 const PetscInt *idx, *ii, *ridx = NULL; 2130 PetscBool usecprow = a->compressedrow.use; 2131 2132 PetscFunctionBegin; 2133 PetscCall(VecGetArrayRead(xx, &x)); 2134 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2135 2136 idx = a->j; 2137 v = a->a; 2138 if (usecprow) { 2139 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 2140 mbs = a->compressedrow.nrows; 2141 ii = a->compressedrow.i; 2142 ridx = a->compressedrow.rindex; 2143 } else { 2144 ii = a->i; 2145 y = yarray; 2146 z = zarray; 2147 } 2148 2149 for (i = 0; i < mbs; i++) { 2150 n = ii[1] - ii[0]; 2151 ii++; 2152 if (usecprow) { 2153 z = zarray + 2 * ridx[i]; 2154 y = yarray + 2 * ridx[i]; 2155 } 2156 sum1 = y[0]; 2157 sum2 = y[1]; 2158 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2159 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2160 for (j = 0; j < n; j++) { 2161 xb = x + 2 * (*idx++); 2162 x1 = xb[0]; 2163 x2 = xb[1]; 2164 2165 sum1 += v[0] * x1 + v[2] * x2; 2166 sum2 += v[1] * x1 + v[3] * x2; 2167 v += 4; 2168 } 2169 z[0] = sum1; 2170 z[1] = sum2; 2171 if (!usecprow) { 2172 z += 2; 2173 y += 2; 2174 } 2175 } 2176 PetscCall(VecRestoreArrayRead(xx, &x)); 2177 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2178 PetscCall(PetscLogFlops(4.0 * a->nz)); 2179 PetscFunctionReturn(0); 2180 } 2181 2182 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2183 { 2184 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2185 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2186 const PetscScalar *x, *xb; 2187 const MatScalar *v; 2188 PetscInt mbs = a->mbs, i, j, n; 2189 const PetscInt *idx, *ii, *ridx = NULL; 2190 PetscBool usecprow = a->compressedrow.use; 2191 2192 PetscFunctionBegin; 2193 PetscCall(VecGetArrayRead(xx, &x)); 2194 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2195 2196 idx = a->j; 2197 v = a->a; 2198 if (usecprow) { 2199 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 2200 mbs = a->compressedrow.nrows; 2201 ii = a->compressedrow.i; 2202 ridx = a->compressedrow.rindex; 2203 } else { 2204 ii = a->i; 2205 y = yarray; 2206 z = zarray; 2207 } 2208 2209 for (i = 0; i < mbs; i++) { 2210 n = ii[1] - ii[0]; 2211 ii++; 2212 if (usecprow) { 2213 z = zarray + 3 * ridx[i]; 2214 y = yarray + 3 * ridx[i]; 2215 } 2216 sum1 = y[0]; 2217 sum2 = y[1]; 2218 sum3 = y[2]; 2219 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2220 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2221 for (j = 0; j < n; j++) { 2222 xb = x + 3 * (*idx++); 2223 x1 = xb[0]; 2224 x2 = xb[1]; 2225 x3 = xb[2]; 2226 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2227 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2228 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2229 v += 9; 2230 } 2231 z[0] = sum1; 2232 z[1] = sum2; 2233 z[2] = sum3; 2234 if (!usecprow) { 2235 z += 3; 2236 y += 3; 2237 } 2238 } 2239 PetscCall(VecRestoreArrayRead(xx, &x)); 2240 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2241 PetscCall(PetscLogFlops(18.0 * a->nz)); 2242 PetscFunctionReturn(0); 2243 } 2244 2245 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2246 { 2247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2248 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2249 const PetscScalar *x, *xb; 2250 const MatScalar *v; 2251 PetscInt mbs = a->mbs, i, j, n; 2252 const PetscInt *idx, *ii, *ridx = NULL; 2253 PetscBool usecprow = a->compressedrow.use; 2254 2255 PetscFunctionBegin; 2256 PetscCall(VecGetArrayRead(xx, &x)); 2257 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2258 2259 idx = a->j; 2260 v = a->a; 2261 if (usecprow) { 2262 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 2263 mbs = a->compressedrow.nrows; 2264 ii = a->compressedrow.i; 2265 ridx = a->compressedrow.rindex; 2266 } else { 2267 ii = a->i; 2268 y = yarray; 2269 z = zarray; 2270 } 2271 2272 for (i = 0; i < mbs; i++) { 2273 n = ii[1] - ii[0]; 2274 ii++; 2275 if (usecprow) { 2276 z = zarray + 4 * ridx[i]; 2277 y = yarray + 4 * ridx[i]; 2278 } 2279 sum1 = y[0]; 2280 sum2 = y[1]; 2281 sum3 = y[2]; 2282 sum4 = y[3]; 2283 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2284 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2285 for (j = 0; j < n; j++) { 2286 xb = x + 4 * (*idx++); 2287 x1 = xb[0]; 2288 x2 = xb[1]; 2289 x3 = xb[2]; 2290 x4 = xb[3]; 2291 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2292 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2293 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2294 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2295 v += 16; 2296 } 2297 z[0] = sum1; 2298 z[1] = sum2; 2299 z[2] = sum3; 2300 z[3] = sum4; 2301 if (!usecprow) { 2302 z += 4; 2303 y += 4; 2304 } 2305 } 2306 PetscCall(VecRestoreArrayRead(xx, &x)); 2307 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2308 PetscCall(PetscLogFlops(32.0 * a->nz)); 2309 PetscFunctionReturn(0); 2310 } 2311 2312 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2313 { 2314 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2315 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2316 const PetscScalar *x, *xb; 2317 PetscScalar *yarray, *zarray; 2318 const MatScalar *v; 2319 PetscInt mbs = a->mbs, i, j, n; 2320 const PetscInt *idx, *ii, *ridx = NULL; 2321 PetscBool usecprow = a->compressedrow.use; 2322 2323 PetscFunctionBegin; 2324 PetscCall(VecGetArrayRead(xx, &x)); 2325 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2326 2327 idx = a->j; 2328 v = a->a; 2329 if (usecprow) { 2330 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 2331 mbs = a->compressedrow.nrows; 2332 ii = a->compressedrow.i; 2333 ridx = a->compressedrow.rindex; 2334 } else { 2335 ii = a->i; 2336 y = yarray; 2337 z = zarray; 2338 } 2339 2340 for (i = 0; i < mbs; i++) { 2341 n = ii[1] - ii[0]; 2342 ii++; 2343 if (usecprow) { 2344 z = zarray + 5 * ridx[i]; 2345 y = yarray + 5 * ridx[i]; 2346 } 2347 sum1 = y[0]; 2348 sum2 = y[1]; 2349 sum3 = y[2]; 2350 sum4 = y[3]; 2351 sum5 = y[4]; 2352 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2353 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2354 for (j = 0; j < n; j++) { 2355 xb = x + 5 * (*idx++); 2356 x1 = xb[0]; 2357 x2 = xb[1]; 2358 x3 = xb[2]; 2359 x4 = xb[3]; 2360 x5 = xb[4]; 2361 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2362 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2363 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2364 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2365 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2366 v += 25; 2367 } 2368 z[0] = sum1; 2369 z[1] = sum2; 2370 z[2] = sum3; 2371 z[3] = sum4; 2372 z[4] = sum5; 2373 if (!usecprow) { 2374 z += 5; 2375 y += 5; 2376 } 2377 } 2378 PetscCall(VecRestoreArrayRead(xx, &x)); 2379 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2380 PetscCall(PetscLogFlops(50.0 * a->nz)); 2381 PetscFunctionReturn(0); 2382 } 2383 2384 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2385 { 2386 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2387 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2388 const PetscScalar *x, *xb; 2389 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2390 const MatScalar *v; 2391 PetscInt mbs = a->mbs, i, j, n; 2392 const PetscInt *idx, *ii, *ridx = NULL; 2393 PetscBool usecprow = a->compressedrow.use; 2394 2395 PetscFunctionBegin; 2396 PetscCall(VecGetArrayRead(xx, &x)); 2397 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2398 2399 idx = a->j; 2400 v = a->a; 2401 if (usecprow) { 2402 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 2403 mbs = a->compressedrow.nrows; 2404 ii = a->compressedrow.i; 2405 ridx = a->compressedrow.rindex; 2406 } else { 2407 ii = a->i; 2408 y = yarray; 2409 z = zarray; 2410 } 2411 2412 for (i = 0; i < mbs; i++) { 2413 n = ii[1] - ii[0]; 2414 ii++; 2415 if (usecprow) { 2416 z = zarray + 6 * ridx[i]; 2417 y = yarray + 6 * ridx[i]; 2418 } 2419 sum1 = y[0]; 2420 sum2 = y[1]; 2421 sum3 = y[2]; 2422 sum4 = y[3]; 2423 sum5 = y[4]; 2424 sum6 = y[5]; 2425 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2426 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2427 for (j = 0; j < n; j++) { 2428 xb = x + 6 * (*idx++); 2429 x1 = xb[0]; 2430 x2 = xb[1]; 2431 x3 = xb[2]; 2432 x4 = xb[3]; 2433 x5 = xb[4]; 2434 x6 = xb[5]; 2435 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2436 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2437 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2438 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2439 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2440 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2441 v += 36; 2442 } 2443 z[0] = sum1; 2444 z[1] = sum2; 2445 z[2] = sum3; 2446 z[3] = sum4; 2447 z[4] = sum5; 2448 z[5] = sum6; 2449 if (!usecprow) { 2450 z += 6; 2451 y += 6; 2452 } 2453 } 2454 PetscCall(VecRestoreArrayRead(xx, &x)); 2455 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2456 PetscCall(PetscLogFlops(72.0 * a->nz)); 2457 PetscFunctionReturn(0); 2458 } 2459 2460 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2461 { 2462 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2463 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2464 const PetscScalar *x, *xb; 2465 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2466 const MatScalar *v; 2467 PetscInt mbs = a->mbs, i, j, n; 2468 const PetscInt *idx, *ii, *ridx = NULL; 2469 PetscBool usecprow = a->compressedrow.use; 2470 2471 PetscFunctionBegin; 2472 PetscCall(VecGetArrayRead(xx, &x)); 2473 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2474 2475 idx = a->j; 2476 v = a->a; 2477 if (usecprow) { 2478 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2479 mbs = a->compressedrow.nrows; 2480 ii = a->compressedrow.i; 2481 ridx = a->compressedrow.rindex; 2482 } else { 2483 ii = a->i; 2484 y = yarray; 2485 z = zarray; 2486 } 2487 2488 for (i = 0; i < mbs; i++) { 2489 n = ii[1] - ii[0]; 2490 ii++; 2491 if (usecprow) { 2492 z = zarray + 7 * ridx[i]; 2493 y = yarray + 7 * ridx[i]; 2494 } 2495 sum1 = y[0]; 2496 sum2 = y[1]; 2497 sum3 = y[2]; 2498 sum4 = y[3]; 2499 sum5 = y[4]; 2500 sum6 = y[5]; 2501 sum7 = y[6]; 2502 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2503 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2504 for (j = 0; j < n; j++) { 2505 xb = x + 7 * (*idx++); 2506 x1 = xb[0]; 2507 x2 = xb[1]; 2508 x3 = xb[2]; 2509 x4 = xb[3]; 2510 x5 = xb[4]; 2511 x6 = xb[5]; 2512 x7 = xb[6]; 2513 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2514 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2515 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2516 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2517 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2518 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2519 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2520 v += 49; 2521 } 2522 z[0] = sum1; 2523 z[1] = sum2; 2524 z[2] = sum3; 2525 z[3] = sum4; 2526 z[4] = sum5; 2527 z[5] = sum6; 2528 z[6] = sum7; 2529 if (!usecprow) { 2530 z += 7; 2531 y += 7; 2532 } 2533 } 2534 PetscCall(VecRestoreArrayRead(xx, &x)); 2535 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2536 PetscCall(PetscLogFlops(98.0 * a->nz)); 2537 PetscFunctionReturn(0); 2538 } 2539 2540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2541 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2542 { 2543 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2544 PetscScalar *z = NULL, *work, *workt, *zarray; 2545 const PetscScalar *x, *xb; 2546 const MatScalar *v; 2547 PetscInt mbs, i, j, n; 2548 PetscInt k; 2549 PetscBool usecprow = a->compressedrow.use; 2550 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2551 2552 __m256d a0, a1, a2, a3, a4, a5; 2553 __m256d w0, w1, w2, w3; 2554 __m256d z0, z1, z2; 2555 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2556 2557 PetscFunctionBegin; 2558 PetscCall(VecCopy(yy, zz)); 2559 PetscCall(VecGetArrayRead(xx, &x)); 2560 PetscCall(VecGetArray(zz, &zarray)); 2561 2562 idx = a->j; 2563 v = a->a; 2564 if (usecprow) { 2565 mbs = a->compressedrow.nrows; 2566 ii = a->compressedrow.i; 2567 ridx = a->compressedrow.rindex; 2568 } else { 2569 mbs = a->mbs; 2570 ii = a->i; 2571 z = zarray; 2572 } 2573 2574 if (!a->mult_work) { 2575 k = PetscMax(A->rmap->n, A->cmap->n); 2576 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2577 } 2578 2579 work = a->mult_work; 2580 for (i = 0; i < mbs; i++) { 2581 n = ii[1] - ii[0]; 2582 ii++; 2583 workt = work; 2584 for (j = 0; j < n; j++) { 2585 xb = x + bs * (*idx++); 2586 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2587 workt += bs; 2588 } 2589 if (usecprow) z = zarray + bs * ridx[i]; 2590 2591 z0 = _mm256_loadu_pd(&z[0]); 2592 z1 = _mm256_loadu_pd(&z[4]); 2593 z2 = _mm256_set1_pd(z[8]); 2594 2595 for (j = 0; j < n; j++) { 2596 /* first column of a */ 2597 w0 = _mm256_set1_pd(work[j * 9]); 2598 a0 = _mm256_loadu_pd(&v[j * 81]); 2599 z0 = _mm256_fmadd_pd(a0, w0, z0); 2600 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2601 z1 = _mm256_fmadd_pd(a1, w0, z1); 2602 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2603 z2 = _mm256_fmadd_pd(a2, w0, z2); 2604 2605 /* second column of a */ 2606 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2607 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2608 z0 = _mm256_fmadd_pd(a0, w1, z0); 2609 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2610 z1 = _mm256_fmadd_pd(a1, w1, z1); 2611 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2612 z2 = _mm256_fmadd_pd(a2, w1, z2); 2613 2614 /* third column of a */ 2615 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2616 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2617 z0 = _mm256_fmadd_pd(a3, w2, z0); 2618 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2619 z1 = _mm256_fmadd_pd(a4, w2, z1); 2620 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2621 z2 = _mm256_fmadd_pd(a5, w2, z2); 2622 2623 /* fourth column of a */ 2624 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2625 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2626 z0 = _mm256_fmadd_pd(a0, w3, z0); 2627 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2628 z1 = _mm256_fmadd_pd(a1, w3, z1); 2629 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2630 z2 = _mm256_fmadd_pd(a2, w3, z2); 2631 2632 /* fifth column of a */ 2633 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2634 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2635 z0 = _mm256_fmadd_pd(a3, w0, z0); 2636 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2637 z1 = _mm256_fmadd_pd(a4, w0, z1); 2638 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2639 z2 = _mm256_fmadd_pd(a5, w0, z2); 2640 2641 /* sixth column of a */ 2642 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2643 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2644 z0 = _mm256_fmadd_pd(a0, w1, z0); 2645 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2646 z1 = _mm256_fmadd_pd(a1, w1, z1); 2647 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2648 z2 = _mm256_fmadd_pd(a2, w1, z2); 2649 2650 /* seventh column of a */ 2651 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2652 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2653 z0 = _mm256_fmadd_pd(a0, w2, z0); 2654 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2655 z1 = _mm256_fmadd_pd(a1, w2, z1); 2656 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2657 z2 = _mm256_fmadd_pd(a2, w2, z2); 2658 2659 /* eighth column of a */ 2660 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2661 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2662 z0 = _mm256_fmadd_pd(a3, w3, z0); 2663 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2664 z1 = _mm256_fmadd_pd(a4, w3, z1); 2665 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2666 z2 = _mm256_fmadd_pd(a5, w3, z2); 2667 2668 /* ninth column of a */ 2669 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2670 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2671 z0 = _mm256_fmadd_pd(a0, w0, z0); 2672 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2673 z1 = _mm256_fmadd_pd(a1, w0, z1); 2674 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2675 z2 = _mm256_fmadd_pd(a2, w0, z2); 2676 } 2677 2678 _mm256_storeu_pd(&z[0], z0); 2679 _mm256_storeu_pd(&z[4], z1); 2680 _mm256_maskstore_pd(&z[8], mask1, z2); 2681 2682 v += n * bs2; 2683 if (!usecprow) z += bs; 2684 } 2685 PetscCall(VecRestoreArrayRead(xx, &x)); 2686 PetscCall(VecRestoreArray(zz, &zarray)); 2687 PetscCall(PetscLogFlops(162.0 * a->nz)); 2688 PetscFunctionReturn(0); 2689 } 2690 #endif 2691 2692 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2693 { 2694 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2695 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2696 const PetscScalar *x, *xb; 2697 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2698 const MatScalar *v; 2699 PetscInt mbs = a->mbs, i, j, n; 2700 const PetscInt *idx, *ii, *ridx = NULL; 2701 PetscBool usecprow = a->compressedrow.use; 2702 2703 PetscFunctionBegin; 2704 PetscCall(VecGetArrayRead(xx, &x)); 2705 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2706 2707 idx = a->j; 2708 v = a->a; 2709 if (usecprow) { 2710 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2711 mbs = a->compressedrow.nrows; 2712 ii = a->compressedrow.i; 2713 ridx = a->compressedrow.rindex; 2714 } else { 2715 ii = a->i; 2716 y = yarray; 2717 z = zarray; 2718 } 2719 2720 for (i = 0; i < mbs; i++) { 2721 n = ii[1] - ii[0]; 2722 ii++; 2723 if (usecprow) { 2724 z = zarray + 11 * ridx[i]; 2725 y = yarray + 11 * ridx[i]; 2726 } 2727 sum1 = y[0]; 2728 sum2 = y[1]; 2729 sum3 = y[2]; 2730 sum4 = y[3]; 2731 sum5 = y[4]; 2732 sum6 = y[5]; 2733 sum7 = y[6]; 2734 sum8 = y[7]; 2735 sum9 = y[8]; 2736 sum10 = y[9]; 2737 sum11 = y[10]; 2738 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2739 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2740 for (j = 0; j < n; j++) { 2741 xb = x + 11 * (*idx++); 2742 x1 = xb[0]; 2743 x2 = xb[1]; 2744 x3 = xb[2]; 2745 x4 = xb[3]; 2746 x5 = xb[4]; 2747 x6 = xb[5]; 2748 x7 = xb[6]; 2749 x8 = xb[7]; 2750 x9 = xb[8]; 2751 x10 = xb[9]; 2752 x11 = xb[10]; 2753 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2754 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2755 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2756 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2757 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2758 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2759 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2760 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2761 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2762 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2763 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2764 v += 121; 2765 } 2766 z[0] = sum1; 2767 z[1] = sum2; 2768 z[2] = sum3; 2769 z[3] = sum4; 2770 z[4] = sum5; 2771 z[5] = sum6; 2772 z[6] = sum7; 2773 z[7] = sum8; 2774 z[8] = sum9; 2775 z[9] = sum10; 2776 z[10] = sum11; 2777 if (!usecprow) { 2778 z += 11; 2779 y += 11; 2780 } 2781 } 2782 PetscCall(VecRestoreArrayRead(xx, &x)); 2783 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2784 PetscCall(PetscLogFlops(242.0 * a->nz)); 2785 PetscFunctionReturn(0); 2786 } 2787 2788 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2789 { 2790 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2791 PetscScalar *z = NULL, *work, *workt, *zarray; 2792 const PetscScalar *x, *xb; 2793 const MatScalar *v; 2794 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2795 PetscInt ncols, k; 2796 const PetscInt *ridx = NULL, *idx, *ii; 2797 PetscBool usecprow = a->compressedrow.use; 2798 2799 PetscFunctionBegin; 2800 PetscCall(VecCopy(yy, zz)); 2801 PetscCall(VecGetArrayRead(xx, &x)); 2802 PetscCall(VecGetArray(zz, &zarray)); 2803 2804 idx = a->j; 2805 v = a->a; 2806 if (usecprow) { 2807 mbs = a->compressedrow.nrows; 2808 ii = a->compressedrow.i; 2809 ridx = a->compressedrow.rindex; 2810 } else { 2811 mbs = a->mbs; 2812 ii = a->i; 2813 z = zarray; 2814 } 2815 2816 if (!a->mult_work) { 2817 k = PetscMax(A->rmap->n, A->cmap->n); 2818 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2819 } 2820 work = a->mult_work; 2821 for (i = 0; i < mbs; i++) { 2822 n = ii[1] - ii[0]; 2823 ii++; 2824 ncols = n * bs; 2825 workt = work; 2826 for (j = 0; j < n; j++) { 2827 xb = x + bs * (*idx++); 2828 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2829 workt += bs; 2830 } 2831 if (usecprow) z = zarray + bs * ridx[i]; 2832 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2833 v += n * bs2; 2834 if (!usecprow) z += bs; 2835 } 2836 PetscCall(VecRestoreArrayRead(xx, &x)); 2837 PetscCall(VecRestoreArray(zz, &zarray)); 2838 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2839 PetscFunctionReturn(0); 2840 } 2841 2842 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2843 { 2844 PetscScalar zero = 0.0; 2845 2846 PetscFunctionBegin; 2847 PetscCall(VecSet(zz, zero)); 2848 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2849 PetscFunctionReturn(0); 2850 } 2851 2852 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2853 { 2854 PetscScalar zero = 0.0; 2855 2856 PetscFunctionBegin; 2857 PetscCall(VecSet(zz, zero)); 2858 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2859 PetscFunctionReturn(0); 2860 } 2861 2862 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2863 { 2864 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2865 PetscScalar *z, x1, x2, x3, x4, x5; 2866 const PetscScalar *x, *xb = NULL; 2867 const MatScalar *v; 2868 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2869 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2870 Mat_CompressedRow cprow = a->compressedrow; 2871 PetscBool usecprow = cprow.use; 2872 2873 PetscFunctionBegin; 2874 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2875 PetscCall(VecGetArrayRead(xx, &x)); 2876 PetscCall(VecGetArray(zz, &z)); 2877 2878 idx = a->j; 2879 v = a->a; 2880 if (usecprow) { 2881 mbs = cprow.nrows; 2882 ii = cprow.i; 2883 ridx = cprow.rindex; 2884 } else { 2885 mbs = a->mbs; 2886 ii = a->i; 2887 xb = x; 2888 } 2889 2890 switch (bs) { 2891 case 1: 2892 for (i = 0; i < mbs; i++) { 2893 if (usecprow) xb = x + ridx[i]; 2894 x1 = xb[0]; 2895 ib = idx + ii[0]; 2896 n = ii[1] - ii[0]; 2897 ii++; 2898 for (j = 0; j < n; j++) { 2899 rval = ib[j]; 2900 z[rval] += PetscConj(*v) * x1; 2901 v++; 2902 } 2903 if (!usecprow) xb++; 2904 } 2905 break; 2906 case 2: 2907 for (i = 0; i < mbs; i++) { 2908 if (usecprow) xb = x + 2 * ridx[i]; 2909 x1 = xb[0]; 2910 x2 = xb[1]; 2911 ib = idx + ii[0]; 2912 n = ii[1] - ii[0]; 2913 ii++; 2914 for (j = 0; j < n; j++) { 2915 rval = ib[j] * 2; 2916 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2917 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2918 v += 4; 2919 } 2920 if (!usecprow) xb += 2; 2921 } 2922 break; 2923 case 3: 2924 for (i = 0; i < mbs; i++) { 2925 if (usecprow) xb = x + 3 * ridx[i]; 2926 x1 = xb[0]; 2927 x2 = xb[1]; 2928 x3 = xb[2]; 2929 ib = idx + ii[0]; 2930 n = ii[1] - ii[0]; 2931 ii++; 2932 for (j = 0; j < n; j++) { 2933 rval = ib[j] * 3; 2934 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2935 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2936 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2937 v += 9; 2938 } 2939 if (!usecprow) xb += 3; 2940 } 2941 break; 2942 case 4: 2943 for (i = 0; i < mbs; i++) { 2944 if (usecprow) xb = x + 4 * ridx[i]; 2945 x1 = xb[0]; 2946 x2 = xb[1]; 2947 x3 = xb[2]; 2948 x4 = xb[3]; 2949 ib = idx + ii[0]; 2950 n = ii[1] - ii[0]; 2951 ii++; 2952 for (j = 0; j < n; j++) { 2953 rval = ib[j] * 4; 2954 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2955 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2956 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2957 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2958 v += 16; 2959 } 2960 if (!usecprow) xb += 4; 2961 } 2962 break; 2963 case 5: 2964 for (i = 0; i < mbs; i++) { 2965 if (usecprow) xb = x + 5 * ridx[i]; 2966 x1 = xb[0]; 2967 x2 = xb[1]; 2968 x3 = xb[2]; 2969 x4 = xb[3]; 2970 x5 = xb[4]; 2971 ib = idx + ii[0]; 2972 n = ii[1] - ii[0]; 2973 ii++; 2974 for (j = 0; j < n; j++) { 2975 rval = ib[j] * 5; 2976 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2977 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2978 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2979 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2980 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2981 v += 25; 2982 } 2983 if (!usecprow) xb += 5; 2984 } 2985 break; 2986 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2987 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2988 #if 0 2989 { 2990 PetscInt ncols,k,bs2=a->bs2; 2991 PetscScalar *work,*workt,zb; 2992 const PetscScalar *xtmp; 2993 if (!a->mult_work) { 2994 k = PetscMax(A->rmap->n,A->cmap->n); 2995 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2996 } 2997 work = a->mult_work; 2998 xtmp = x; 2999 for (i=0; i<mbs; i++) { 3000 n = ii[1] - ii[0]; ii++; 3001 ncols = n*bs; 3002 PetscCall(PetscArrayzero(work,ncols)); 3003 if (usecprow) xtmp = x + bs*ridx[i]; 3004 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 3005 v += n*bs2; 3006 if (!usecprow) xtmp += bs; 3007 workt = work; 3008 for (j=0; j<n; j++) { 3009 zb = z + bs*(*idx++); 3010 for (k=0; k<bs; k++) zb[k] += workt[k] ; 3011 workt += bs; 3012 } 3013 } 3014 } 3015 #endif 3016 } 3017 PetscCall(VecRestoreArrayRead(xx, &x)); 3018 PetscCall(VecRestoreArray(zz, &z)); 3019 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3020 PetscFunctionReturn(0); 3021 } 3022 3023 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 3024 { 3025 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3026 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 3027 const PetscScalar *x, *xb = NULL; 3028 const MatScalar *v; 3029 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3030 const PetscInt *idx, *ii, *ib, *ridx = NULL; 3031 Mat_CompressedRow cprow = a->compressedrow; 3032 PetscBool usecprow = cprow.use; 3033 3034 PetscFunctionBegin; 3035 if (yy != zz) PetscCall(VecCopy(yy, zz)); 3036 PetscCall(VecGetArrayRead(xx, &x)); 3037 PetscCall(VecGetArray(zz, &z)); 3038 3039 idx = a->j; 3040 v = a->a; 3041 if (usecprow) { 3042 mbs = cprow.nrows; 3043 ii = cprow.i; 3044 ridx = cprow.rindex; 3045 } else { 3046 mbs = a->mbs; 3047 ii = a->i; 3048 xb = x; 3049 } 3050 3051 switch (bs) { 3052 case 1: 3053 for (i = 0; i < mbs; i++) { 3054 if (usecprow) xb = x + ridx[i]; 3055 x1 = xb[0]; 3056 ib = idx + ii[0]; 3057 n = ii[1] - ii[0]; 3058 ii++; 3059 for (j = 0; j < n; j++) { 3060 rval = ib[j]; 3061 z[rval] += *v * x1; 3062 v++; 3063 } 3064 if (!usecprow) xb++; 3065 } 3066 break; 3067 case 2: 3068 for (i = 0; i < mbs; i++) { 3069 if (usecprow) xb = x + 2 * ridx[i]; 3070 x1 = xb[0]; 3071 x2 = xb[1]; 3072 ib = idx + ii[0]; 3073 n = ii[1] - ii[0]; 3074 ii++; 3075 for (j = 0; j < n; j++) { 3076 rval = ib[j] * 2; 3077 z[rval++] += v[0] * x1 + v[1] * x2; 3078 z[rval++] += v[2] * x1 + v[3] * x2; 3079 v += 4; 3080 } 3081 if (!usecprow) xb += 2; 3082 } 3083 break; 3084 case 3: 3085 for (i = 0; i < mbs; i++) { 3086 if (usecprow) xb = x + 3 * ridx[i]; 3087 x1 = xb[0]; 3088 x2 = xb[1]; 3089 x3 = xb[2]; 3090 ib = idx + ii[0]; 3091 n = ii[1] - ii[0]; 3092 ii++; 3093 for (j = 0; j < n; j++) { 3094 rval = ib[j] * 3; 3095 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3096 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3097 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3098 v += 9; 3099 } 3100 if (!usecprow) xb += 3; 3101 } 3102 break; 3103 case 4: 3104 for (i = 0; i < mbs; i++) { 3105 if (usecprow) xb = x + 4 * ridx[i]; 3106 x1 = xb[0]; 3107 x2 = xb[1]; 3108 x3 = xb[2]; 3109 x4 = xb[3]; 3110 ib = idx + ii[0]; 3111 n = ii[1] - ii[0]; 3112 ii++; 3113 for (j = 0; j < n; j++) { 3114 rval = ib[j] * 4; 3115 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3116 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3117 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3118 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3119 v += 16; 3120 } 3121 if (!usecprow) xb += 4; 3122 } 3123 break; 3124 case 5: 3125 for (i = 0; i < mbs; i++) { 3126 if (usecprow) xb = x + 5 * ridx[i]; 3127 x1 = xb[0]; 3128 x2 = xb[1]; 3129 x3 = xb[2]; 3130 x4 = xb[3]; 3131 x5 = xb[4]; 3132 ib = idx + ii[0]; 3133 n = ii[1] - ii[0]; 3134 ii++; 3135 for (j = 0; j < n; j++) { 3136 rval = ib[j] * 5; 3137 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3138 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3139 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3140 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3141 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3142 v += 25; 3143 } 3144 if (!usecprow) xb += 5; 3145 } 3146 break; 3147 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3148 PetscInt ncols, k; 3149 PetscScalar *work, *workt; 3150 const PetscScalar *xtmp; 3151 if (!a->mult_work) { 3152 k = PetscMax(A->rmap->n, A->cmap->n); 3153 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3154 } 3155 work = a->mult_work; 3156 xtmp = x; 3157 for (i = 0; i < mbs; i++) { 3158 n = ii[1] - ii[0]; 3159 ii++; 3160 ncols = n * bs; 3161 PetscCall(PetscArrayzero(work, ncols)); 3162 if (usecprow) xtmp = x + bs * ridx[i]; 3163 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3164 v += n * bs2; 3165 if (!usecprow) xtmp += bs; 3166 workt = work; 3167 for (j = 0; j < n; j++) { 3168 zb = z + bs * (*idx++); 3169 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3170 workt += bs; 3171 } 3172 } 3173 } 3174 } 3175 PetscCall(VecRestoreArrayRead(xx, &x)); 3176 PetscCall(VecRestoreArray(zz, &z)); 3177 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3178 PetscFunctionReturn(0); 3179 } 3180 3181 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3182 { 3183 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3184 PetscInt totalnz = a->bs2 * a->nz; 3185 PetscScalar oalpha = alpha; 3186 PetscBLASInt one = 1, tnz; 3187 3188 PetscFunctionBegin; 3189 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3190 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3191 PetscCall(PetscLogFlops(totalnz)); 3192 PetscFunctionReturn(0); 3193 } 3194 3195 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3196 { 3197 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3198 MatScalar *v = a->a; 3199 PetscReal sum = 0.0; 3200 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3201 3202 PetscFunctionBegin; 3203 if (type == NORM_FROBENIUS) { 3204 #if defined(PETSC_USE_REAL___FP16) 3205 PetscBLASInt one = 1, cnt = bs2 * nz; 3206 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3207 #else 3208 for (i = 0; i < bs2 * nz; i++) { 3209 sum += PetscRealPart(PetscConj(*v) * (*v)); 3210 v++; 3211 } 3212 #endif 3213 *norm = PetscSqrtReal(sum); 3214 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3215 } else if (type == NORM_1) { /* maximum column sum */ 3216 PetscReal *tmp; 3217 PetscInt *bcol = a->j; 3218 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3219 for (i = 0; i < nz; i++) { 3220 for (j = 0; j < bs; j++) { 3221 k1 = bs * (*bcol) + j; /* column index */ 3222 for (k = 0; k < bs; k++) { 3223 tmp[k1] += PetscAbsScalar(*v); 3224 v++; 3225 } 3226 } 3227 bcol++; 3228 } 3229 *norm = 0.0; 3230 for (j = 0; j < A->cmap->n; j++) { 3231 if (tmp[j] > *norm) *norm = tmp[j]; 3232 } 3233 PetscCall(PetscFree(tmp)); 3234 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3235 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3236 *norm = 0.0; 3237 for (k = 0; k < bs; k++) { 3238 for (j = 0; j < a->mbs; j++) { 3239 v = a->a + bs2 * a->i[j] + k; 3240 sum = 0.0; 3241 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3242 for (k1 = 0; k1 < bs; k1++) { 3243 sum += PetscAbsScalar(*v); 3244 v += bs; 3245 } 3246 } 3247 if (sum > *norm) *norm = sum; 3248 } 3249 } 3250 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3251 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3252 PetscFunctionReturn(0); 3253 } 3254 3255 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3256 { 3257 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3258 3259 PetscFunctionBegin; 3260 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3261 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3262 *flg = PETSC_FALSE; 3263 PetscFunctionReturn(0); 3264 } 3265 3266 /* if the a->i are the same */ 3267 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3268 if (!*flg) PetscFunctionReturn(0); 3269 3270 /* if a->j are the same */ 3271 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3272 if (!*flg) PetscFunctionReturn(0); 3273 3274 /* if a->a are the same */ 3275 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3276 PetscFunctionReturn(0); 3277 } 3278 3279 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3280 { 3281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3282 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3283 PetscScalar *x, zero = 0.0; 3284 MatScalar *aa, *aa_j; 3285 3286 PetscFunctionBegin; 3287 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3288 bs = A->rmap->bs; 3289 aa = a->a; 3290 ai = a->i; 3291 aj = a->j; 3292 ambs = a->mbs; 3293 bs2 = a->bs2; 3294 3295 PetscCall(VecSet(v, zero)); 3296 PetscCall(VecGetArray(v, &x)); 3297 PetscCall(VecGetLocalSize(v, &n)); 3298 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3299 for (i = 0; i < ambs; i++) { 3300 for (j = ai[i]; j < ai[i + 1]; j++) { 3301 if (aj[j] == i) { 3302 row = i * bs; 3303 aa_j = aa + j * bs2; 3304 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3305 break; 3306 } 3307 } 3308 } 3309 PetscCall(VecRestoreArray(v, &x)); 3310 PetscFunctionReturn(0); 3311 } 3312 3313 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3314 { 3315 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3316 const PetscScalar *l, *r, *li, *ri; 3317 PetscScalar x; 3318 MatScalar *aa, *v; 3319 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3320 const PetscInt *ai, *aj; 3321 3322 PetscFunctionBegin; 3323 ai = a->i; 3324 aj = a->j; 3325 aa = a->a; 3326 m = A->rmap->n; 3327 n = A->cmap->n; 3328 bs = A->rmap->bs; 3329 mbs = a->mbs; 3330 bs2 = a->bs2; 3331 if (ll) { 3332 PetscCall(VecGetArrayRead(ll, &l)); 3333 PetscCall(VecGetLocalSize(ll, &lm)); 3334 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3335 for (i = 0; i < mbs; i++) { /* for each block row */ 3336 M = ai[i + 1] - ai[i]; 3337 li = l + i * bs; 3338 v = aa + bs2 * ai[i]; 3339 for (j = 0; j < M; j++) { /* for each block */ 3340 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 3341 } 3342 } 3343 PetscCall(VecRestoreArrayRead(ll, &l)); 3344 PetscCall(PetscLogFlops(a->nz)); 3345 } 3346 3347 if (rr) { 3348 PetscCall(VecGetArrayRead(rr, &r)); 3349 PetscCall(VecGetLocalSize(rr, &rn)); 3350 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3351 for (i = 0; i < mbs; i++) { /* for each block row */ 3352 iai = ai[i]; 3353 M = ai[i + 1] - iai; 3354 v = aa + bs2 * iai; 3355 for (j = 0; j < M; j++) { /* for each block */ 3356 ri = r + bs * aj[iai + j]; 3357 for (k = 0; k < bs; k++) { 3358 x = ri[k]; 3359 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3360 v += bs; 3361 } 3362 } 3363 } 3364 PetscCall(VecRestoreArrayRead(rr, &r)); 3365 PetscCall(PetscLogFlops(a->nz)); 3366 } 3367 PetscFunctionReturn(0); 3368 } 3369 3370 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3371 { 3372 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3373 3374 PetscFunctionBegin; 3375 info->block_size = a->bs2; 3376 info->nz_allocated = a->bs2 * a->maxnz; 3377 info->nz_used = a->bs2 * a->nz; 3378 info->nz_unneeded = info->nz_allocated - info->nz_used; 3379 info->assemblies = A->num_ass; 3380 info->mallocs = A->info.mallocs; 3381 info->memory = 0; /* REVIEW ME */ 3382 if (A->factortype) { 3383 info->fill_ratio_given = A->info.fill_ratio_given; 3384 info->fill_ratio_needed = A->info.fill_ratio_needed; 3385 info->factor_mallocs = A->info.factor_mallocs; 3386 } else { 3387 info->fill_ratio_given = 0; 3388 info->fill_ratio_needed = 0; 3389 info->factor_mallocs = 0; 3390 } 3391 PetscFunctionReturn(0); 3392 } 3393 3394 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3395 { 3396 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3397 3398 PetscFunctionBegin; 3399 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3400 PetscFunctionReturn(0); 3401 } 3402 3403 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3404 { 3405 PetscFunctionBegin; 3406 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3407 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3408 PetscFunctionReturn(0); 3409 } 3410 3411 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3412 { 3413 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3414 PetscScalar *z = NULL, sum1; 3415 const PetscScalar *xb; 3416 PetscScalar x1; 3417 const MatScalar *v, *vv; 3418 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3419 PetscBool usecprow = a->compressedrow.use; 3420 3421 PetscFunctionBegin; 3422 idx = a->j; 3423 v = a->a; 3424 if (usecprow) { 3425 mbs = a->compressedrow.nrows; 3426 ii = a->compressedrow.i; 3427 ridx = a->compressedrow.rindex; 3428 } else { 3429 mbs = a->mbs; 3430 ii = a->i; 3431 z = c; 3432 } 3433 3434 for (i = 0; i < mbs; i++) { 3435 n = ii[1] - ii[0]; 3436 ii++; 3437 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3438 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3439 if (usecprow) z = c + ridx[i]; 3440 jj = idx; 3441 vv = v; 3442 for (k = 0; k < cn; k++) { 3443 idx = jj; 3444 v = vv; 3445 sum1 = 0.0; 3446 for (j = 0; j < n; j++) { 3447 xb = b + (*idx++); 3448 x1 = xb[0 + k * bm]; 3449 sum1 += v[0] * x1; 3450 v += 1; 3451 } 3452 z[0 + k * cm] = sum1; 3453 } 3454 if (!usecprow) z += 1; 3455 } 3456 PetscFunctionReturn(0); 3457 } 3458 3459 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3460 { 3461 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3462 PetscScalar *z = NULL, sum1, sum2; 3463 const PetscScalar *xb; 3464 PetscScalar x1, x2; 3465 const MatScalar *v, *vv; 3466 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3467 PetscBool usecprow = a->compressedrow.use; 3468 3469 PetscFunctionBegin; 3470 idx = a->j; 3471 v = a->a; 3472 if (usecprow) { 3473 mbs = a->compressedrow.nrows; 3474 ii = a->compressedrow.i; 3475 ridx = a->compressedrow.rindex; 3476 } else { 3477 mbs = a->mbs; 3478 ii = a->i; 3479 z = c; 3480 } 3481 3482 for (i = 0; i < mbs; i++) { 3483 n = ii[1] - ii[0]; 3484 ii++; 3485 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3486 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3487 if (usecprow) z = c + 2 * ridx[i]; 3488 jj = idx; 3489 vv = v; 3490 for (k = 0; k < cn; k++) { 3491 idx = jj; 3492 v = vv; 3493 sum1 = 0.0; 3494 sum2 = 0.0; 3495 for (j = 0; j < n; j++) { 3496 xb = b + 2 * (*idx++); 3497 x1 = xb[0 + k * bm]; 3498 x2 = xb[1 + k * bm]; 3499 sum1 += v[0] * x1 + v[2] * x2; 3500 sum2 += v[1] * x1 + v[3] * x2; 3501 v += 4; 3502 } 3503 z[0 + k * cm] = sum1; 3504 z[1 + k * cm] = sum2; 3505 } 3506 if (!usecprow) z += 2; 3507 } 3508 PetscFunctionReturn(0); 3509 } 3510 3511 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3512 { 3513 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3514 PetscScalar *z = NULL, sum1, sum2, sum3; 3515 const PetscScalar *xb; 3516 PetscScalar x1, x2, x3; 3517 const MatScalar *v, *vv; 3518 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3519 PetscBool usecprow = a->compressedrow.use; 3520 3521 PetscFunctionBegin; 3522 idx = a->j; 3523 v = a->a; 3524 if (usecprow) { 3525 mbs = a->compressedrow.nrows; 3526 ii = a->compressedrow.i; 3527 ridx = a->compressedrow.rindex; 3528 } else { 3529 mbs = a->mbs; 3530 ii = a->i; 3531 z = c; 3532 } 3533 3534 for (i = 0; i < mbs; i++) { 3535 n = ii[1] - ii[0]; 3536 ii++; 3537 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3538 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3539 if (usecprow) z = c + 3 * ridx[i]; 3540 jj = idx; 3541 vv = v; 3542 for (k = 0; k < cn; k++) { 3543 idx = jj; 3544 v = vv; 3545 sum1 = 0.0; 3546 sum2 = 0.0; 3547 sum3 = 0.0; 3548 for (j = 0; j < n; j++) { 3549 xb = b + 3 * (*idx++); 3550 x1 = xb[0 + k * bm]; 3551 x2 = xb[1 + k * bm]; 3552 x3 = xb[2 + k * bm]; 3553 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3554 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3555 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3556 v += 9; 3557 } 3558 z[0 + k * cm] = sum1; 3559 z[1 + k * cm] = sum2; 3560 z[2 + k * cm] = sum3; 3561 } 3562 if (!usecprow) z += 3; 3563 } 3564 PetscFunctionReturn(0); 3565 } 3566 3567 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3568 { 3569 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3570 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3571 const PetscScalar *xb; 3572 PetscScalar x1, x2, x3, x4; 3573 const MatScalar *v, *vv; 3574 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3575 PetscBool usecprow = a->compressedrow.use; 3576 3577 PetscFunctionBegin; 3578 idx = a->j; 3579 v = a->a; 3580 if (usecprow) { 3581 mbs = a->compressedrow.nrows; 3582 ii = a->compressedrow.i; 3583 ridx = a->compressedrow.rindex; 3584 } else { 3585 mbs = a->mbs; 3586 ii = a->i; 3587 z = c; 3588 } 3589 3590 for (i = 0; i < mbs; i++) { 3591 n = ii[1] - ii[0]; 3592 ii++; 3593 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3594 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3595 if (usecprow) z = c + 4 * ridx[i]; 3596 jj = idx; 3597 vv = v; 3598 for (k = 0; k < cn; k++) { 3599 idx = jj; 3600 v = vv; 3601 sum1 = 0.0; 3602 sum2 = 0.0; 3603 sum3 = 0.0; 3604 sum4 = 0.0; 3605 for (j = 0; j < n; j++) { 3606 xb = b + 4 * (*idx++); 3607 x1 = xb[0 + k * bm]; 3608 x2 = xb[1 + k * bm]; 3609 x3 = xb[2 + k * bm]; 3610 x4 = xb[3 + k * bm]; 3611 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3612 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3613 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3614 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3615 v += 16; 3616 } 3617 z[0 + k * cm] = sum1; 3618 z[1 + k * cm] = sum2; 3619 z[2 + k * cm] = sum3; 3620 z[3 + k * cm] = sum4; 3621 } 3622 if (!usecprow) z += 4; 3623 } 3624 PetscFunctionReturn(0); 3625 } 3626 3627 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3628 { 3629 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3630 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3631 const PetscScalar *xb; 3632 PetscScalar x1, x2, x3, x4, x5; 3633 const MatScalar *v, *vv; 3634 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3635 PetscBool usecprow = a->compressedrow.use; 3636 3637 PetscFunctionBegin; 3638 idx = a->j; 3639 v = a->a; 3640 if (usecprow) { 3641 mbs = a->compressedrow.nrows; 3642 ii = a->compressedrow.i; 3643 ridx = a->compressedrow.rindex; 3644 } else { 3645 mbs = a->mbs; 3646 ii = a->i; 3647 z = c; 3648 } 3649 3650 for (i = 0; i < mbs; i++) { 3651 n = ii[1] - ii[0]; 3652 ii++; 3653 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3654 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3655 if (usecprow) z = c + 5 * ridx[i]; 3656 jj = idx; 3657 vv = v; 3658 for (k = 0; k < cn; k++) { 3659 idx = jj; 3660 v = vv; 3661 sum1 = 0.0; 3662 sum2 = 0.0; 3663 sum3 = 0.0; 3664 sum4 = 0.0; 3665 sum5 = 0.0; 3666 for (j = 0; j < n; j++) { 3667 xb = b + 5 * (*idx++); 3668 x1 = xb[0 + k * bm]; 3669 x2 = xb[1 + k * bm]; 3670 x3 = xb[2 + k * bm]; 3671 x4 = xb[3 + k * bm]; 3672 x5 = xb[4 + k * bm]; 3673 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3674 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3675 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3676 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3677 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3678 v += 25; 3679 } 3680 z[0 + k * cm] = sum1; 3681 z[1 + k * cm] = sum2; 3682 z[2 + k * cm] = sum3; 3683 z[3 + k * cm] = sum4; 3684 z[4 + k * cm] = sum5; 3685 } 3686 if (!usecprow) z += 5; 3687 } 3688 PetscFunctionReturn(0); 3689 } 3690 3691 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3692 { 3693 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3694 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3695 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3696 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3697 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3698 PetscBLASInt bbs, bcn, bbm, bcm; 3699 PetscScalar *z = NULL; 3700 PetscScalar *c, *b; 3701 const MatScalar *v; 3702 const PetscInt *idx, *ii, *ridx = NULL; 3703 PetscScalar _DZero = 0.0, _DOne = 1.0; 3704 PetscBool usecprow = a->compressedrow.use; 3705 3706 PetscFunctionBegin; 3707 if (!cm || !cn) PetscFunctionReturn(0); 3708 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3709 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3710 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3711 b = bd->v; 3712 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 3713 PetscCall(MatDenseGetArray(C, &c)); 3714 switch (bs) { 3715 case 1: 3716 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3717 break; 3718 case 2: 3719 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3720 break; 3721 case 3: 3722 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3723 break; 3724 case 4: 3725 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3726 break; 3727 case 5: 3728 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3729 break; 3730 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3731 PetscCall(PetscBLASIntCast(bs, &bbs)); 3732 PetscCall(PetscBLASIntCast(cn, &bcn)); 3733 PetscCall(PetscBLASIntCast(bm, &bbm)); 3734 PetscCall(PetscBLASIntCast(cm, &bcm)); 3735 idx = a->j; 3736 v = a->a; 3737 if (usecprow) { 3738 mbs = a->compressedrow.nrows; 3739 ii = a->compressedrow.i; 3740 ridx = a->compressedrow.rindex; 3741 } else { 3742 mbs = a->mbs; 3743 ii = a->i; 3744 z = c; 3745 } 3746 for (i = 0; i < mbs; i++) { 3747 n = ii[1] - ii[0]; 3748 ii++; 3749 if (usecprow) z = c + bs * ridx[i]; 3750 if (n) { 3751 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3752 v += bs2; 3753 } 3754 for (j = 1; j < n; j++) { 3755 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3756 v += bs2; 3757 } 3758 if (!usecprow) z += bs; 3759 } 3760 } 3761 PetscCall(MatDenseRestoreArray(C, &c)); 3762 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3763 PetscFunctionReturn(0); 3764 } 3765