1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 15 const PetscInt *idx; 16 PetscInt start, end, *ai, *aj, bs; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m, &table)); 28 PetscCall(PetscMalloc1(m + 1, &nidx)); 29 30 for (i = 0; i < is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 PetscCall(PetscBTMemzero(m, table)); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 PetscCall(ISGetIndices(is[i], &idx)); 37 PetscCall(ISGetLocalSize(is[i], &n)); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j = 0; j < n; ++j) { 41 ival = idx[j] / bs; /* convert the indices into block indices */ 42 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 43 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44 } 45 PetscCall(ISRestoreIndices(is[i], &idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j = 0; j < ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row + 1]; 55 for (l = start; l < end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58 } 59 } 60 } 61 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 62 } 63 PetscCall(PetscBTDestroy(&table)); 64 PetscCall(PetscFree(nidx)); 65 PetscFunctionReturn(PETSC_SUCCESS); 66 } 67 68 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 69 { 70 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 71 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 72 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 73 const PetscInt *irow, *icol; 74 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 75 PetscInt *aj = a->j, *ai = a->i; 76 MatScalar *mat_a; 77 Mat C; 78 PetscBool flag; 79 80 PetscFunctionBegin; 81 PetscCall(ISGetIndices(isrow, &irow)); 82 PetscCall(ISGetIndices(iscol, &icol)); 83 PetscCall(ISGetLocalSize(isrow, &nrows)); 84 PetscCall(ISGetLocalSize(iscol, &ncols)); 85 86 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 87 ssmap = smap; 88 PetscCall(PetscMalloc1(1 + nrows, &lens)); 89 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 90 /* determine lens of each row */ 91 for (i = 0; i < nrows; i++) { 92 kstart = ai[irow[i]]; 93 kend = kstart + a->ilen[irow[i]]; 94 lens[i] = 0; 95 for (k = kstart; k < kend; k++) { 96 if (ssmap[aj[k]]) lens[i]++; 97 } 98 } 99 /* Create and fill new matrix */ 100 if (scall == MAT_REUSE_MATRIX) { 101 c = (Mat_SeqBAIJ *)((*B)->data); 102 103 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 104 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 105 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 106 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 107 C = *B; 108 } else { 109 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 110 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 111 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 112 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 113 } 114 c = (Mat_SeqBAIJ *)(C->data); 115 for (i = 0; i < nrows; i++) { 116 row = irow[i]; 117 kstart = ai[row]; 118 kend = kstart + a->ilen[row]; 119 mat_i = c->i[i]; 120 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 121 mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 122 mat_ilen = c->ilen + i; 123 for (k = kstart; k < kend; k++) { 124 if ((tcol = ssmap[a->j[k]])) { 125 *mat_j++ = tcol - 1; 126 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 127 mat_a += bs2; 128 (*mat_ilen)++; 129 } 130 } 131 } 132 /* sort */ 133 if (c->j && c->a) { 134 MatScalar *work; 135 PetscCall(PetscMalloc1(bs2, &work)); 136 for (i = 0; i < nrows; i++) { 137 PetscInt ilen; 138 mat_i = c->i[i]; 139 mat_j = c->j + mat_i; 140 mat_a = c->a + mat_i * bs2; 141 ilen = c->ilen[i]; 142 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 143 } 144 PetscCall(PetscFree(work)); 145 } 146 147 /* Free work space */ 148 PetscCall(ISRestoreIndices(iscol, &icol)); 149 PetscCall(PetscFree(smap)); 150 PetscCall(PetscFree(lens)); 151 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 152 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 153 154 PetscCall(ISRestoreIndices(isrow, &irow)); 155 *B = C; 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 160 { 161 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 162 IS is1, is2; 163 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 164 const PetscInt *irow, *icol; 165 166 PetscFunctionBegin; 167 PetscCall(ISGetIndices(isrow, &irow)); 168 PetscCall(ISGetIndices(iscol, &icol)); 169 PetscCall(ISGetLocalSize(isrow, &nrows)); 170 PetscCall(ISGetLocalSize(iscol, &ncols)); 171 172 /* Verify if the indices correspond to each element in a block 173 and form the IS with compressed IS */ 174 maxmnbs = PetscMax(a->mbs, a->nbs); 175 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 176 PetscCall(PetscArrayzero(vary, a->mbs)); 177 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 178 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 179 count = 0; 180 for (i = 0; i < nrows; i++) { 181 j = irow[i] / bs; 182 if ((vary[j]--) == bs) iary[count++] = j; 183 } 184 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 185 186 PetscCall(PetscArrayzero(vary, a->nbs)); 187 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 188 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 189 count = 0; 190 for (i = 0; i < ncols; i++) { 191 j = icol[i] / bs; 192 if ((vary[j]--) == bs) iary[count++] = j; 193 } 194 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 195 PetscCall(ISRestoreIndices(isrow, &irow)); 196 PetscCall(ISRestoreIndices(iscol, &icol)); 197 PetscCall(PetscFree2(vary, iary)); 198 199 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 200 PetscCall(ISDestroy(&is1)); 201 PetscCall(ISDestroy(&is2)); 202 PetscFunctionReturn(PETSC_SUCCESS); 203 } 204 205 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 206 { 207 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 208 Mat_SubSppt *submatj = c->submatis1; 209 210 PetscFunctionBegin; 211 PetscCall((*submatj->destroy)(C)); 212 PetscCall(MatDestroySubMatrix_Private(submatj)); 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 217 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 218 { 219 PetscInt i; 220 Mat C; 221 Mat_SeqBAIJ *c; 222 Mat_SubSppt *submatj; 223 224 PetscFunctionBegin; 225 for (i = 0; i < n; i++) { 226 C = (*mat)[i]; 227 c = (Mat_SeqBAIJ *)C->data; 228 submatj = c->submatis1; 229 if (submatj) { 230 if (--((PetscObject)C)->refct <= 0) { 231 PetscCall(PetscFree(C->factorprefix)); 232 PetscCall((*submatj->destroy)(C)); 233 PetscCall(MatDestroySubMatrix_Private(submatj)); 234 PetscCall(PetscFree(C->defaultvectype)); 235 PetscCall(PetscFree(C->defaultrandtype)); 236 PetscCall(PetscLayoutDestroy(&C->rmap)); 237 PetscCall(PetscLayoutDestroy(&C->cmap)); 238 PetscCall(PetscHeaderDestroy(&C)); 239 } 240 } else { 241 PetscCall(MatDestroy(&C)); 242 } 243 } 244 245 /* Destroy Dummy submatrices created for reuse */ 246 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 247 248 PetscCall(PetscFree(*mat)); 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 253 { 254 PetscInt i; 255 256 PetscFunctionBegin; 257 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 258 259 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 260 PetscFunctionReturn(PETSC_SUCCESS); 261 } 262 263 /* -------------------------------------------------------*/ 264 /* Should check that shapes of vectors and matrices match */ 265 /* -------------------------------------------------------*/ 266 267 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 268 { 269 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 270 PetscScalar *z, sum; 271 const PetscScalar *x; 272 const MatScalar *v; 273 PetscInt mbs, i, n; 274 const PetscInt *idx, *ii, *ridx = NULL; 275 PetscBool usecprow = a->compressedrow.use; 276 277 PetscFunctionBegin; 278 PetscCall(VecGetArrayRead(xx, &x)); 279 PetscCall(VecGetArrayWrite(zz, &z)); 280 281 if (usecprow) { 282 mbs = a->compressedrow.nrows; 283 ii = a->compressedrow.i; 284 ridx = a->compressedrow.rindex; 285 PetscCall(PetscArrayzero(z, a->mbs)); 286 } else { 287 mbs = a->mbs; 288 ii = a->i; 289 } 290 291 for (i = 0; i < mbs; i++) { 292 n = ii[1] - ii[0]; 293 v = a->a + ii[0]; 294 idx = a->j + ii[0]; 295 ii++; 296 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 297 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 298 sum = 0.0; 299 PetscSparseDensePlusDot(sum, x, v, idx, n); 300 if (usecprow) { 301 z[ridx[i]] = sum; 302 } else { 303 z[i] = sum; 304 } 305 } 306 PetscCall(VecRestoreArrayRead(xx, &x)); 307 PetscCall(VecRestoreArrayWrite(zz, &z)); 308 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 309 PetscFunctionReturn(PETSC_SUCCESS); 310 } 311 312 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 313 { 314 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 315 PetscScalar *z = NULL, sum1, sum2, *zarray; 316 const PetscScalar *x, *xb; 317 PetscScalar x1, x2; 318 const MatScalar *v; 319 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 320 PetscBool usecprow = a->compressedrow.use; 321 322 PetscFunctionBegin; 323 PetscCall(VecGetArrayRead(xx, &x)); 324 PetscCall(VecGetArrayWrite(zz, &zarray)); 325 326 idx = a->j; 327 v = a->a; 328 if (usecprow) { 329 mbs = a->compressedrow.nrows; 330 ii = a->compressedrow.i; 331 ridx = a->compressedrow.rindex; 332 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 333 } else { 334 mbs = a->mbs; 335 ii = a->i; 336 z = zarray; 337 } 338 339 for (i = 0; i < mbs; i++) { 340 n = ii[1] - ii[0]; 341 ii++; 342 sum1 = 0.0; 343 sum2 = 0.0; 344 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 345 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 346 for (j = 0; j < n; j++) { 347 xb = x + 2 * (*idx++); 348 x1 = xb[0]; 349 x2 = xb[1]; 350 sum1 += v[0] * x1 + v[2] * x2; 351 sum2 += v[1] * x1 + v[3] * x2; 352 v += 4; 353 } 354 if (usecprow) z = zarray + 2 * ridx[i]; 355 z[0] = sum1; 356 z[1] = sum2; 357 if (!usecprow) z += 2; 358 } 359 PetscCall(VecRestoreArrayRead(xx, &x)); 360 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 361 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 362 PetscFunctionReturn(PETSC_SUCCESS); 363 } 364 365 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 366 { 367 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 368 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 369 const PetscScalar *x, *xb; 370 const MatScalar *v; 371 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 372 PetscBool usecprow = a->compressedrow.use; 373 374 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 375 #pragma disjoint(*v, *z, *xb) 376 #endif 377 378 PetscFunctionBegin; 379 PetscCall(VecGetArrayRead(xx, &x)); 380 PetscCall(VecGetArrayWrite(zz, &zarray)); 381 382 idx = a->j; 383 v = a->a; 384 if (usecprow) { 385 mbs = a->compressedrow.nrows; 386 ii = a->compressedrow.i; 387 ridx = a->compressedrow.rindex; 388 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 389 } else { 390 mbs = a->mbs; 391 ii = a->i; 392 z = zarray; 393 } 394 395 for (i = 0; i < mbs; i++) { 396 n = ii[1] - ii[0]; 397 ii++; 398 sum1 = 0.0; 399 sum2 = 0.0; 400 sum3 = 0.0; 401 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 402 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 403 for (j = 0; j < n; j++) { 404 xb = x + 3 * (*idx++); 405 x1 = xb[0]; 406 x2 = xb[1]; 407 x3 = xb[2]; 408 409 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 410 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 411 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 412 v += 9; 413 } 414 if (usecprow) z = zarray + 3 * ridx[i]; 415 z[0] = sum1; 416 z[1] = sum2; 417 z[2] = sum3; 418 if (!usecprow) z += 3; 419 } 420 PetscCall(VecRestoreArrayRead(xx, &x)); 421 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 422 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 423 PetscFunctionReturn(PETSC_SUCCESS); 424 } 425 426 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 427 { 428 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 429 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 430 const PetscScalar *x, *xb; 431 const MatScalar *v; 432 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 433 PetscBool usecprow = a->compressedrow.use; 434 435 PetscFunctionBegin; 436 PetscCall(VecGetArrayRead(xx, &x)); 437 PetscCall(VecGetArrayWrite(zz, &zarray)); 438 439 idx = a->j; 440 v = a->a; 441 if (usecprow) { 442 mbs = a->compressedrow.nrows; 443 ii = a->compressedrow.i; 444 ridx = a->compressedrow.rindex; 445 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 446 } else { 447 mbs = a->mbs; 448 ii = a->i; 449 z = zarray; 450 } 451 452 for (i = 0; i < mbs; i++) { 453 n = ii[1] - ii[0]; 454 ii++; 455 sum1 = 0.0; 456 sum2 = 0.0; 457 sum3 = 0.0; 458 sum4 = 0.0; 459 460 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 461 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 462 for (j = 0; j < n; j++) { 463 xb = x + 4 * (*idx++); 464 x1 = xb[0]; 465 x2 = xb[1]; 466 x3 = xb[2]; 467 x4 = xb[3]; 468 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 469 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 470 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 471 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 472 v += 16; 473 } 474 if (usecprow) z = zarray + 4 * ridx[i]; 475 z[0] = sum1; 476 z[1] = sum2; 477 z[2] = sum3; 478 z[3] = sum4; 479 if (!usecprow) z += 4; 480 } 481 PetscCall(VecRestoreArrayRead(xx, &x)); 482 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 483 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 484 PetscFunctionReturn(PETSC_SUCCESS); 485 } 486 487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 490 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 491 const PetscScalar *xb, *x; 492 const MatScalar *v; 493 const PetscInt *idx, *ii, *ridx = NULL; 494 PetscInt mbs, i, j, n; 495 PetscBool usecprow = a->compressedrow.use; 496 497 PetscFunctionBegin; 498 PetscCall(VecGetArrayRead(xx, &x)); 499 PetscCall(VecGetArrayWrite(zz, &zarray)); 500 501 idx = a->j; 502 v = a->a; 503 if (usecprow) { 504 mbs = a->compressedrow.nrows; 505 ii = a->compressedrow.i; 506 ridx = a->compressedrow.rindex; 507 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 508 } else { 509 mbs = a->mbs; 510 ii = a->i; 511 z = zarray; 512 } 513 514 for (i = 0; i < mbs; i++) { 515 n = ii[1] - ii[0]; 516 ii++; 517 sum1 = 0.0; 518 sum2 = 0.0; 519 sum3 = 0.0; 520 sum4 = 0.0; 521 sum5 = 0.0; 522 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 523 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 524 for (j = 0; j < n; j++) { 525 xb = x + 5 * (*idx++); 526 x1 = xb[0]; 527 x2 = xb[1]; 528 x3 = xb[2]; 529 x4 = xb[3]; 530 x5 = xb[4]; 531 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 532 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 533 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 534 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 535 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 536 v += 25; 537 } 538 if (usecprow) z = zarray + 5 * ridx[i]; 539 z[0] = sum1; 540 z[1] = sum2; 541 z[2] = sum3; 542 z[3] = sum4; 543 z[4] = sum5; 544 if (!usecprow) z += 5; 545 } 546 PetscCall(VecRestoreArrayRead(xx, &x)); 547 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 548 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 549 PetscFunctionReturn(PETSC_SUCCESS); 550 } 551 552 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 553 { 554 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 555 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 556 const PetscScalar *x, *xb; 557 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 558 const MatScalar *v; 559 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 560 PetscBool usecprow = a->compressedrow.use; 561 562 PetscFunctionBegin; 563 PetscCall(VecGetArrayRead(xx, &x)); 564 PetscCall(VecGetArrayWrite(zz, &zarray)); 565 566 idx = a->j; 567 v = a->a; 568 if (usecprow) { 569 mbs = a->compressedrow.nrows; 570 ii = a->compressedrow.i; 571 ridx = a->compressedrow.rindex; 572 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 573 } else { 574 mbs = a->mbs; 575 ii = a->i; 576 z = zarray; 577 } 578 579 for (i = 0; i < mbs; i++) { 580 n = ii[1] - ii[0]; 581 ii++; 582 sum1 = 0.0; 583 sum2 = 0.0; 584 sum3 = 0.0; 585 sum4 = 0.0; 586 sum5 = 0.0; 587 sum6 = 0.0; 588 589 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 590 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 591 for (j = 0; j < n; j++) { 592 xb = x + 6 * (*idx++); 593 x1 = xb[0]; 594 x2 = xb[1]; 595 x3 = xb[2]; 596 x4 = xb[3]; 597 x5 = xb[4]; 598 x6 = xb[5]; 599 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 600 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 601 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 602 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 603 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 604 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 605 v += 36; 606 } 607 if (usecprow) z = zarray + 6 * ridx[i]; 608 z[0] = sum1; 609 z[1] = sum2; 610 z[2] = sum3; 611 z[3] = sum4; 612 z[4] = sum5; 613 z[5] = sum6; 614 if (!usecprow) z += 6; 615 } 616 617 PetscCall(VecRestoreArrayRead(xx, &x)); 618 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 619 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 620 PetscFunctionReturn(PETSC_SUCCESS); 621 } 622 623 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 624 { 625 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 626 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 627 const PetscScalar *x, *xb; 628 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 629 const MatScalar *v; 630 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 631 PetscBool usecprow = a->compressedrow.use; 632 633 PetscFunctionBegin; 634 PetscCall(VecGetArrayRead(xx, &x)); 635 PetscCall(VecGetArrayWrite(zz, &zarray)); 636 637 idx = a->j; 638 v = a->a; 639 if (usecprow) { 640 mbs = a->compressedrow.nrows; 641 ii = a->compressedrow.i; 642 ridx = a->compressedrow.rindex; 643 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 644 } else { 645 mbs = a->mbs; 646 ii = a->i; 647 z = zarray; 648 } 649 650 for (i = 0; i < mbs; i++) { 651 n = ii[1] - ii[0]; 652 ii++; 653 sum1 = 0.0; 654 sum2 = 0.0; 655 sum3 = 0.0; 656 sum4 = 0.0; 657 sum5 = 0.0; 658 sum6 = 0.0; 659 sum7 = 0.0; 660 661 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 662 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 663 for (j = 0; j < n; j++) { 664 xb = x + 7 * (*idx++); 665 x1 = xb[0]; 666 x2 = xb[1]; 667 x3 = xb[2]; 668 x4 = xb[3]; 669 x5 = xb[4]; 670 x6 = xb[5]; 671 x7 = xb[6]; 672 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 673 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 674 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 675 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 676 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 677 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 678 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 679 v += 49; 680 } 681 if (usecprow) z = zarray + 7 * ridx[i]; 682 z[0] = sum1; 683 z[1] = sum2; 684 z[2] = sum3; 685 z[3] = sum4; 686 z[4] = sum5; 687 z[5] = sum6; 688 z[6] = sum7; 689 if (!usecprow) z += 7; 690 } 691 692 PetscCall(VecRestoreArrayRead(xx, &x)); 693 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 694 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 695 PetscFunctionReturn(PETSC_SUCCESS); 696 } 697 698 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 699 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 700 { 701 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 702 PetscScalar *z = NULL, *work, *workt, *zarray; 703 const PetscScalar *x, *xb; 704 const MatScalar *v; 705 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 706 const PetscInt *idx, *ii, *ridx = NULL; 707 PetscInt k; 708 PetscBool usecprow = a->compressedrow.use; 709 710 __m256d a0, a1, a2, a3, a4, a5; 711 __m256d w0, w1, w2, w3; 712 __m256d z0, z1, z2; 713 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 714 715 PetscFunctionBegin; 716 PetscCall(VecGetArrayRead(xx, &x)); 717 PetscCall(VecGetArrayWrite(zz, &zarray)); 718 719 idx = a->j; 720 v = a->a; 721 if (usecprow) { 722 mbs = a->compressedrow.nrows; 723 ii = a->compressedrow.i; 724 ridx = a->compressedrow.rindex; 725 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 726 } else { 727 mbs = a->mbs; 728 ii = a->i; 729 z = zarray; 730 } 731 732 if (!a->mult_work) { 733 k = PetscMax(A->rmap->n, A->cmap->n); 734 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 735 } 736 737 work = a->mult_work; 738 for (i = 0; i < mbs; i++) { 739 n = ii[1] - ii[0]; 740 ii++; 741 workt = work; 742 for (j = 0; j < n; j++) { 743 xb = x + bs * (*idx++); 744 for (k = 0; k < bs; k++) workt[k] = xb[k]; 745 workt += bs; 746 } 747 if (usecprow) z = zarray + bs * ridx[i]; 748 749 z0 = _mm256_setzero_pd(); 750 z1 = _mm256_setzero_pd(); 751 z2 = _mm256_setzero_pd(); 752 753 for (j = 0; j < n; j++) { 754 /* first column of a */ 755 w0 = _mm256_set1_pd(work[j * 9]); 756 a0 = _mm256_loadu_pd(&v[j * 81]); 757 z0 = _mm256_fmadd_pd(a0, w0, z0); 758 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 759 z1 = _mm256_fmadd_pd(a1, w0, z1); 760 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 761 z2 = _mm256_fmadd_pd(a2, w0, z2); 762 763 /* second column of a */ 764 w1 = _mm256_set1_pd(work[j * 9 + 1]); 765 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 766 z0 = _mm256_fmadd_pd(a0, w1, z0); 767 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 768 z1 = _mm256_fmadd_pd(a1, w1, z1); 769 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 770 z2 = _mm256_fmadd_pd(a2, w1, z2); 771 772 /* third column of a */ 773 w2 = _mm256_set1_pd(work[j * 9 + 2]); 774 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 775 z0 = _mm256_fmadd_pd(a3, w2, z0); 776 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 777 z1 = _mm256_fmadd_pd(a4, w2, z1); 778 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 779 z2 = _mm256_fmadd_pd(a5, w2, z2); 780 781 /* fourth column of a */ 782 w3 = _mm256_set1_pd(work[j * 9 + 3]); 783 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 784 z0 = _mm256_fmadd_pd(a0, w3, z0); 785 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 786 z1 = _mm256_fmadd_pd(a1, w3, z1); 787 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 788 z2 = _mm256_fmadd_pd(a2, w3, z2); 789 790 /* fifth column of a */ 791 w0 = _mm256_set1_pd(work[j * 9 + 4]); 792 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 793 z0 = _mm256_fmadd_pd(a3, w0, z0); 794 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 795 z1 = _mm256_fmadd_pd(a4, w0, z1); 796 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 797 z2 = _mm256_fmadd_pd(a5, w0, z2); 798 799 /* sixth column of a */ 800 w1 = _mm256_set1_pd(work[j * 9 + 5]); 801 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 802 z0 = _mm256_fmadd_pd(a0, w1, z0); 803 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 804 z1 = _mm256_fmadd_pd(a1, w1, z1); 805 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 806 z2 = _mm256_fmadd_pd(a2, w1, z2); 807 808 /* seventh column of a */ 809 w2 = _mm256_set1_pd(work[j * 9 + 6]); 810 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 811 z0 = _mm256_fmadd_pd(a0, w2, z0); 812 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 813 z1 = _mm256_fmadd_pd(a1, w2, z1); 814 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 815 z2 = _mm256_fmadd_pd(a2, w2, z2); 816 817 /* eighth column of a */ 818 w3 = _mm256_set1_pd(work[j * 9 + 7]); 819 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 820 z0 = _mm256_fmadd_pd(a3, w3, z0); 821 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 822 z1 = _mm256_fmadd_pd(a4, w3, z1); 823 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 824 z2 = _mm256_fmadd_pd(a5, w3, z2); 825 826 /* ninth column of a */ 827 w0 = _mm256_set1_pd(work[j * 9 + 8]); 828 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 829 z0 = _mm256_fmadd_pd(a0, w0, z0); 830 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 831 z1 = _mm256_fmadd_pd(a1, w0, z1); 832 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 833 z2 = _mm256_fmadd_pd(a2, w0, z2); 834 } 835 836 _mm256_storeu_pd(&z[0], z0); 837 _mm256_storeu_pd(&z[4], z1); 838 _mm256_maskstore_pd(&z[8], mask1, z2); 839 840 v += n * bs2; 841 if (!usecprow) z += bs; 842 } 843 PetscCall(VecRestoreArrayRead(xx, &x)); 844 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 845 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 846 PetscFunctionReturn(PETSC_SUCCESS); 847 } 848 #endif 849 850 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 851 { 852 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 853 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 854 const PetscScalar *x, *xb; 855 PetscScalar *zarray, xv; 856 const MatScalar *v; 857 const PetscInt *ii, *ij = a->j, *idx; 858 PetscInt mbs, i, j, k, n, *ridx = NULL; 859 PetscBool usecprow = a->compressedrow.use; 860 861 PetscFunctionBegin; 862 PetscCall(VecGetArrayRead(xx, &x)); 863 PetscCall(VecGetArrayWrite(zz, &zarray)); 864 865 v = a->a; 866 if (usecprow) { 867 mbs = a->compressedrow.nrows; 868 ii = a->compressedrow.i; 869 ridx = a->compressedrow.rindex; 870 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 871 } else { 872 mbs = a->mbs; 873 ii = a->i; 874 z = zarray; 875 } 876 877 for (i = 0; i < mbs; i++) { 878 n = ii[i + 1] - ii[i]; 879 idx = ij + ii[i]; 880 sum1 = 0.0; 881 sum2 = 0.0; 882 sum3 = 0.0; 883 sum4 = 0.0; 884 sum5 = 0.0; 885 sum6 = 0.0; 886 sum7 = 0.0; 887 sum8 = 0.0; 888 sum9 = 0.0; 889 sum10 = 0.0; 890 sum11 = 0.0; 891 892 for (j = 0; j < n; j++) { 893 xb = x + 11 * (idx[j]); 894 895 for (k = 0; k < 11; k++) { 896 xv = xb[k]; 897 sum1 += v[0] * xv; 898 sum2 += v[1] * xv; 899 sum3 += v[2] * xv; 900 sum4 += v[3] * xv; 901 sum5 += v[4] * xv; 902 sum6 += v[5] * xv; 903 sum7 += v[6] * xv; 904 sum8 += v[7] * xv; 905 sum9 += v[8] * xv; 906 sum10 += v[9] * xv; 907 sum11 += v[10] * xv; 908 v += 11; 909 } 910 } 911 if (usecprow) z = zarray + 11 * ridx[i]; 912 z[0] = sum1; 913 z[1] = sum2; 914 z[2] = sum3; 915 z[3] = sum4; 916 z[4] = sum5; 917 z[5] = sum6; 918 z[6] = sum7; 919 z[7] = sum8; 920 z[8] = sum9; 921 z[9] = sum10; 922 z[10] = sum11; 923 924 if (!usecprow) z += 11; 925 } 926 927 PetscCall(VecRestoreArrayRead(xx, &x)); 928 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 929 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 930 PetscFunctionReturn(PETSC_SUCCESS); 931 } 932 933 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 934 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 935 { 936 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 937 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 938 const PetscScalar *x, *xb; 939 PetscScalar *zarray, xv; 940 const MatScalar *v; 941 const PetscInt *ii, *ij = a->j, *idx; 942 PetscInt mbs, i, j, k, n, *ridx = NULL; 943 PetscBool usecprow = a->compressedrow.use; 944 945 PetscFunctionBegin; 946 PetscCall(VecGetArrayRead(xx, &x)); 947 PetscCall(VecGetArrayWrite(zz, &zarray)); 948 949 v = a->a; 950 if (usecprow) { 951 mbs = a->compressedrow.nrows; 952 ii = a->compressedrow.i; 953 ridx = a->compressedrow.rindex; 954 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 955 } else { 956 mbs = a->mbs; 957 ii = a->i; 958 z = zarray; 959 } 960 961 for (i = 0; i < mbs; i++) { 962 n = ii[i + 1] - ii[i]; 963 idx = ij + ii[i]; 964 sum1 = 0.0; 965 sum2 = 0.0; 966 sum3 = 0.0; 967 sum4 = 0.0; 968 sum5 = 0.0; 969 sum6 = 0.0; 970 sum7 = 0.0; 971 sum8 = 0.0; 972 sum9 = 0.0; 973 sum10 = 0.0; 974 sum11 = 0.0; 975 sum12 = 0.0; 976 977 for (j = 0; j < n; j++) { 978 xb = x + 12 * (idx[j]); 979 980 for (k = 0; k < 12; k++) { 981 xv = xb[k]; 982 sum1 += v[0] * xv; 983 sum2 += v[1] * xv; 984 sum3 += v[2] * xv; 985 sum4 += v[3] * xv; 986 sum5 += v[4] * xv; 987 sum6 += v[5] * xv; 988 sum7 += v[6] * xv; 989 sum8 += v[7] * xv; 990 sum9 += v[8] * xv; 991 sum10 += v[9] * xv; 992 sum11 += v[10] * xv; 993 sum12 += v[11] * xv; 994 v += 12; 995 } 996 } 997 if (usecprow) z = zarray + 12 * ridx[i]; 998 z[0] = sum1; 999 z[1] = sum2; 1000 z[2] = sum3; 1001 z[3] = sum4; 1002 z[4] = sum5; 1003 z[5] = sum6; 1004 z[6] = sum7; 1005 z[7] = sum8; 1006 z[8] = sum9; 1007 z[9] = sum10; 1008 z[10] = sum11; 1009 z[11] = sum12; 1010 if (!usecprow) z += 12; 1011 } 1012 PetscCall(VecRestoreArrayRead(xx, &x)); 1013 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1014 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1015 PetscFunctionReturn(PETSC_SUCCESS); 1016 } 1017 1018 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 1019 { 1020 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1021 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1022 const PetscScalar *x, *xb; 1023 PetscScalar *zarray, *yarray, xv; 1024 const MatScalar *v; 1025 const PetscInt *ii, *ij = a->j, *idx; 1026 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 1027 PetscBool usecprow = a->compressedrow.use; 1028 1029 PetscFunctionBegin; 1030 PetscCall(VecGetArrayRead(xx, &x)); 1031 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1032 1033 v = a->a; 1034 if (usecprow) { 1035 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1036 mbs = a->compressedrow.nrows; 1037 ii = a->compressedrow.i; 1038 ridx = a->compressedrow.rindex; 1039 } else { 1040 ii = a->i; 1041 y = yarray; 1042 z = zarray; 1043 } 1044 1045 for (i = 0; i < mbs; i++) { 1046 n = ii[i + 1] - ii[i]; 1047 idx = ij + ii[i]; 1048 1049 if (usecprow) { 1050 y = yarray + 12 * ridx[i]; 1051 z = zarray + 12 * ridx[i]; 1052 } 1053 sum1 = y[0]; 1054 sum2 = y[1]; 1055 sum3 = y[2]; 1056 sum4 = y[3]; 1057 sum5 = y[4]; 1058 sum6 = y[5]; 1059 sum7 = y[6]; 1060 sum8 = y[7]; 1061 sum9 = y[8]; 1062 sum10 = y[9]; 1063 sum11 = y[10]; 1064 sum12 = y[11]; 1065 1066 for (j = 0; j < n; j++) { 1067 xb = x + 12 * (idx[j]); 1068 1069 for (k = 0; k < 12; k++) { 1070 xv = xb[k]; 1071 sum1 += v[0] * xv; 1072 sum2 += v[1] * xv; 1073 sum3 += v[2] * xv; 1074 sum4 += v[3] * xv; 1075 sum5 += v[4] * xv; 1076 sum6 += v[5] * xv; 1077 sum7 += v[6] * xv; 1078 sum8 += v[7] * xv; 1079 sum9 += v[8] * xv; 1080 sum10 += v[9] * xv; 1081 sum11 += v[10] * xv; 1082 sum12 += v[11] * xv; 1083 v += 12; 1084 } 1085 } 1086 1087 z[0] = sum1; 1088 z[1] = sum2; 1089 z[2] = sum3; 1090 z[3] = sum4; 1091 z[4] = sum5; 1092 z[5] = sum6; 1093 z[6] = sum7; 1094 z[7] = sum8; 1095 z[8] = sum9; 1096 z[9] = sum10; 1097 z[10] = sum11; 1098 z[11] = sum12; 1099 if (!usecprow) { 1100 y += 12; 1101 z += 12; 1102 } 1103 } 1104 PetscCall(VecRestoreArrayRead(xx, &x)); 1105 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1106 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1107 PetscFunctionReturn(PETSC_SUCCESS); 1108 } 1109 1110 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1111 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1112 { 1113 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1114 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1115 const PetscScalar *x, *xb; 1116 PetscScalar x1, x2, x3, x4, *zarray; 1117 const MatScalar *v; 1118 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1119 PetscInt mbs, i, j, n; 1120 PetscBool usecprow = a->compressedrow.use; 1121 1122 PetscFunctionBegin; 1123 PetscCall(VecGetArrayRead(xx, &x)); 1124 PetscCall(VecGetArrayWrite(zz, &zarray)); 1125 1126 v = a->a; 1127 if (usecprow) { 1128 mbs = a->compressedrow.nrows; 1129 ii = a->compressedrow.i; 1130 ridx = a->compressedrow.rindex; 1131 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1132 } else { 1133 mbs = a->mbs; 1134 ii = a->i; 1135 z = zarray; 1136 } 1137 1138 for (i = 0; i < mbs; i++) { 1139 n = ii[i + 1] - ii[i]; 1140 idx = ij + ii[i]; 1141 1142 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1143 for (j = 0; j < n; j++) { 1144 xb = x + 12 * (idx[j]); 1145 x1 = xb[0]; 1146 x2 = xb[1]; 1147 x3 = xb[2]; 1148 x4 = xb[3]; 1149 1150 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1151 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1152 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1153 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1154 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1155 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1156 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1157 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1158 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1159 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1160 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1161 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1162 v += 48; 1163 1164 x1 = xb[4]; 1165 x2 = xb[5]; 1166 x3 = xb[6]; 1167 x4 = xb[7]; 1168 1169 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1170 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1171 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1172 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1173 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1174 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1175 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1176 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1177 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1178 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1179 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1180 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1181 v += 48; 1182 1183 x1 = xb[8]; 1184 x2 = xb[9]; 1185 x3 = xb[10]; 1186 x4 = xb[11]; 1187 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1188 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1189 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1190 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1191 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1192 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1193 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1194 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1195 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1196 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1197 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1198 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1199 v += 48; 1200 } 1201 if (usecprow) z = zarray + 12 * ridx[i]; 1202 z[0] = sum1; 1203 z[1] = sum2; 1204 z[2] = sum3; 1205 z[3] = sum4; 1206 z[4] = sum5; 1207 z[5] = sum6; 1208 z[6] = sum7; 1209 z[7] = sum8; 1210 z[8] = sum9; 1211 z[9] = sum10; 1212 z[10] = sum11; 1213 z[11] = sum12; 1214 if (!usecprow) z += 12; 1215 } 1216 PetscCall(VecRestoreArrayRead(xx, &x)); 1217 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1218 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1219 PetscFunctionReturn(PETSC_SUCCESS); 1220 } 1221 1222 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1223 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1224 { 1225 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1226 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1227 const PetscScalar *x, *xb; 1228 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1229 const MatScalar *v; 1230 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1231 PetscInt mbs = a->mbs, i, j, n; 1232 PetscBool usecprow = a->compressedrow.use; 1233 1234 PetscFunctionBegin; 1235 PetscCall(VecGetArrayRead(xx, &x)); 1236 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1237 1238 v = a->a; 1239 if (usecprow) { 1240 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1241 mbs = a->compressedrow.nrows; 1242 ii = a->compressedrow.i; 1243 ridx = a->compressedrow.rindex; 1244 } else { 1245 ii = a->i; 1246 y = yarray; 1247 z = zarray; 1248 } 1249 1250 for (i = 0; i < mbs; i++) { 1251 n = ii[i + 1] - ii[i]; 1252 idx = ij + ii[i]; 1253 1254 if (usecprow) { 1255 y = yarray + 12 * ridx[i]; 1256 z = zarray + 12 * ridx[i]; 1257 } 1258 sum1 = y[0]; 1259 sum2 = y[1]; 1260 sum3 = y[2]; 1261 sum4 = y[3]; 1262 sum5 = y[4]; 1263 sum6 = y[5]; 1264 sum7 = y[6]; 1265 sum8 = y[7]; 1266 sum9 = y[8]; 1267 sum10 = y[9]; 1268 sum11 = y[10]; 1269 sum12 = y[11]; 1270 1271 for (j = 0; j < n; j++) { 1272 xb = x + 12 * (idx[j]); 1273 x1 = xb[0]; 1274 x2 = xb[1]; 1275 x3 = xb[2]; 1276 x4 = xb[3]; 1277 1278 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1279 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1280 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1281 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1282 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1283 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1284 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1285 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1286 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1287 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1288 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1289 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1290 v += 48; 1291 1292 x1 = xb[4]; 1293 x2 = xb[5]; 1294 x3 = xb[6]; 1295 x4 = xb[7]; 1296 1297 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1298 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1299 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1300 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1301 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1302 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1303 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1304 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1305 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1306 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1307 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1308 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1309 v += 48; 1310 1311 x1 = xb[8]; 1312 x2 = xb[9]; 1313 x3 = xb[10]; 1314 x4 = xb[11]; 1315 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1316 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1317 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1318 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1319 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1320 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1321 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1322 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1323 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1324 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1325 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1326 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1327 v += 48; 1328 } 1329 z[0] = sum1; 1330 z[1] = sum2; 1331 z[2] = sum3; 1332 z[3] = sum4; 1333 z[4] = sum5; 1334 z[5] = sum6; 1335 z[6] = sum7; 1336 z[7] = sum8; 1337 z[8] = sum9; 1338 z[9] = sum10; 1339 z[10] = sum11; 1340 z[11] = sum12; 1341 if (!usecprow) { 1342 y += 12; 1343 z += 12; 1344 } 1345 } 1346 PetscCall(VecRestoreArrayRead(xx, &x)); 1347 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1348 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1349 PetscFunctionReturn(PETSC_SUCCESS); 1350 } 1351 1352 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1353 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1354 { 1355 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1356 PetscScalar *z = NULL, *zarray; 1357 const PetscScalar *x, *work; 1358 const MatScalar *v = a->a; 1359 PetscInt mbs, i, j, n; 1360 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1361 PetscBool usecprow = a->compressedrow.use; 1362 const PetscInt bs = 12, bs2 = 144; 1363 1364 __m256d a0, a1, a2, a3, a4, a5; 1365 __m256d w0, w1, w2, w3; 1366 __m256d z0, z1, z2; 1367 1368 PetscFunctionBegin; 1369 PetscCall(VecGetArrayRead(xx, &x)); 1370 PetscCall(VecGetArrayWrite(zz, &zarray)); 1371 1372 if (usecprow) { 1373 mbs = a->compressedrow.nrows; 1374 ii = a->compressedrow.i; 1375 ridx = a->compressedrow.rindex; 1376 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1377 } else { 1378 mbs = a->mbs; 1379 ii = a->i; 1380 z = zarray; 1381 } 1382 1383 for (i = 0; i < mbs; i++) { 1384 z0 = _mm256_setzero_pd(); 1385 z1 = _mm256_setzero_pd(); 1386 z2 = _mm256_setzero_pd(); 1387 1388 n = ii[1] - ii[0]; 1389 ii++; 1390 for (j = 0; j < n; j++) { 1391 work = x + bs * (*idx++); 1392 1393 /* first column of a */ 1394 w0 = _mm256_set1_pd(work[0]); 1395 a0 = _mm256_loadu_pd(v + 0); 1396 z0 = _mm256_fmadd_pd(a0, w0, z0); 1397 a1 = _mm256_loadu_pd(v + 4); 1398 z1 = _mm256_fmadd_pd(a1, w0, z1); 1399 a2 = _mm256_loadu_pd(v + 8); 1400 z2 = _mm256_fmadd_pd(a2, w0, z2); 1401 1402 /* second column of a */ 1403 w1 = _mm256_set1_pd(work[1]); 1404 a3 = _mm256_loadu_pd(v + 12); 1405 z0 = _mm256_fmadd_pd(a3, w1, z0); 1406 a4 = _mm256_loadu_pd(v + 16); 1407 z1 = _mm256_fmadd_pd(a4, w1, z1); 1408 a5 = _mm256_loadu_pd(v + 20); 1409 z2 = _mm256_fmadd_pd(a5, w1, z2); 1410 1411 /* third column of a */ 1412 w2 = _mm256_set1_pd(work[2]); 1413 a0 = _mm256_loadu_pd(v + 24); 1414 z0 = _mm256_fmadd_pd(a0, w2, z0); 1415 a1 = _mm256_loadu_pd(v + 28); 1416 z1 = _mm256_fmadd_pd(a1, w2, z1); 1417 a2 = _mm256_loadu_pd(v + 32); 1418 z2 = _mm256_fmadd_pd(a2, w2, z2); 1419 1420 /* fourth column of a */ 1421 w3 = _mm256_set1_pd(work[3]); 1422 a3 = _mm256_loadu_pd(v + 36); 1423 z0 = _mm256_fmadd_pd(a3, w3, z0); 1424 a4 = _mm256_loadu_pd(v + 40); 1425 z1 = _mm256_fmadd_pd(a4, w3, z1); 1426 a5 = _mm256_loadu_pd(v + 44); 1427 z2 = _mm256_fmadd_pd(a5, w3, z2); 1428 1429 /* fifth column of a */ 1430 w0 = _mm256_set1_pd(work[4]); 1431 a0 = _mm256_loadu_pd(v + 48); 1432 z0 = _mm256_fmadd_pd(a0, w0, z0); 1433 a1 = _mm256_loadu_pd(v + 52); 1434 z1 = _mm256_fmadd_pd(a1, w0, z1); 1435 a2 = _mm256_loadu_pd(v + 56); 1436 z2 = _mm256_fmadd_pd(a2, w0, z2); 1437 1438 /* sixth column of a */ 1439 w1 = _mm256_set1_pd(work[5]); 1440 a3 = _mm256_loadu_pd(v + 60); 1441 z0 = _mm256_fmadd_pd(a3, w1, z0); 1442 a4 = _mm256_loadu_pd(v + 64); 1443 z1 = _mm256_fmadd_pd(a4, w1, z1); 1444 a5 = _mm256_loadu_pd(v + 68); 1445 z2 = _mm256_fmadd_pd(a5, w1, z2); 1446 1447 /* seventh column of a */ 1448 w2 = _mm256_set1_pd(work[6]); 1449 a0 = _mm256_loadu_pd(v + 72); 1450 z0 = _mm256_fmadd_pd(a0, w2, z0); 1451 a1 = _mm256_loadu_pd(v + 76); 1452 z1 = _mm256_fmadd_pd(a1, w2, z1); 1453 a2 = _mm256_loadu_pd(v + 80); 1454 z2 = _mm256_fmadd_pd(a2, w2, z2); 1455 1456 /* eighth column of a */ 1457 w3 = _mm256_set1_pd(work[7]); 1458 a3 = _mm256_loadu_pd(v + 84); 1459 z0 = _mm256_fmadd_pd(a3, w3, z0); 1460 a4 = _mm256_loadu_pd(v + 88); 1461 z1 = _mm256_fmadd_pd(a4, w3, z1); 1462 a5 = _mm256_loadu_pd(v + 92); 1463 z2 = _mm256_fmadd_pd(a5, w3, z2); 1464 1465 /* ninth column of a */ 1466 w0 = _mm256_set1_pd(work[8]); 1467 a0 = _mm256_loadu_pd(v + 96); 1468 z0 = _mm256_fmadd_pd(a0, w0, z0); 1469 a1 = _mm256_loadu_pd(v + 100); 1470 z1 = _mm256_fmadd_pd(a1, w0, z1); 1471 a2 = _mm256_loadu_pd(v + 104); 1472 z2 = _mm256_fmadd_pd(a2, w0, z2); 1473 1474 /* tenth column of a */ 1475 w1 = _mm256_set1_pd(work[9]); 1476 a3 = _mm256_loadu_pd(v + 108); 1477 z0 = _mm256_fmadd_pd(a3, w1, z0); 1478 a4 = _mm256_loadu_pd(v + 112); 1479 z1 = _mm256_fmadd_pd(a4, w1, z1); 1480 a5 = _mm256_loadu_pd(v + 116); 1481 z2 = _mm256_fmadd_pd(a5, w1, z2); 1482 1483 /* eleventh column of a */ 1484 w2 = _mm256_set1_pd(work[10]); 1485 a0 = _mm256_loadu_pd(v + 120); 1486 z0 = _mm256_fmadd_pd(a0, w2, z0); 1487 a1 = _mm256_loadu_pd(v + 124); 1488 z1 = _mm256_fmadd_pd(a1, w2, z1); 1489 a2 = _mm256_loadu_pd(v + 128); 1490 z2 = _mm256_fmadd_pd(a2, w2, z2); 1491 1492 /* twelveth column of a */ 1493 w3 = _mm256_set1_pd(work[11]); 1494 a3 = _mm256_loadu_pd(v + 132); 1495 z0 = _mm256_fmadd_pd(a3, w3, z0); 1496 a4 = _mm256_loadu_pd(v + 136); 1497 z1 = _mm256_fmadd_pd(a4, w3, z1); 1498 a5 = _mm256_loadu_pd(v + 140); 1499 z2 = _mm256_fmadd_pd(a5, w3, z2); 1500 1501 v += bs2; 1502 } 1503 if (usecprow) z = zarray + bs * ridx[i]; 1504 _mm256_storeu_pd(&z[0], z0); 1505 _mm256_storeu_pd(&z[4], z1); 1506 _mm256_storeu_pd(&z[8], z2); 1507 if (!usecprow) z += bs; 1508 } 1509 PetscCall(VecRestoreArrayRead(xx, &x)); 1510 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1511 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1512 PetscFunctionReturn(PETSC_SUCCESS); 1513 } 1514 #endif 1515 1516 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1517 /* Default MatMult for block size 15 */ 1518 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1519 { 1520 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1521 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1522 const PetscScalar *x, *xb; 1523 PetscScalar *zarray, xv; 1524 const MatScalar *v; 1525 const PetscInt *ii, *ij = a->j, *idx; 1526 PetscInt mbs, i, j, k, n, *ridx = NULL; 1527 PetscBool usecprow = a->compressedrow.use; 1528 1529 PetscFunctionBegin; 1530 PetscCall(VecGetArrayRead(xx, &x)); 1531 PetscCall(VecGetArrayWrite(zz, &zarray)); 1532 1533 v = a->a; 1534 if (usecprow) { 1535 mbs = a->compressedrow.nrows; 1536 ii = a->compressedrow.i; 1537 ridx = a->compressedrow.rindex; 1538 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1539 } else { 1540 mbs = a->mbs; 1541 ii = a->i; 1542 z = zarray; 1543 } 1544 1545 for (i = 0; i < mbs; i++) { 1546 n = ii[i + 1] - ii[i]; 1547 idx = ij + ii[i]; 1548 sum1 = 0.0; 1549 sum2 = 0.0; 1550 sum3 = 0.0; 1551 sum4 = 0.0; 1552 sum5 = 0.0; 1553 sum6 = 0.0; 1554 sum7 = 0.0; 1555 sum8 = 0.0; 1556 sum9 = 0.0; 1557 sum10 = 0.0; 1558 sum11 = 0.0; 1559 sum12 = 0.0; 1560 sum13 = 0.0; 1561 sum14 = 0.0; 1562 sum15 = 0.0; 1563 1564 for (j = 0; j < n; j++) { 1565 xb = x + 15 * (idx[j]); 1566 1567 for (k = 0; k < 15; k++) { 1568 xv = xb[k]; 1569 sum1 += v[0] * xv; 1570 sum2 += v[1] * xv; 1571 sum3 += v[2] * xv; 1572 sum4 += v[3] * xv; 1573 sum5 += v[4] * xv; 1574 sum6 += v[5] * xv; 1575 sum7 += v[6] * xv; 1576 sum8 += v[7] * xv; 1577 sum9 += v[8] * xv; 1578 sum10 += v[9] * xv; 1579 sum11 += v[10] * xv; 1580 sum12 += v[11] * xv; 1581 sum13 += v[12] * xv; 1582 sum14 += v[13] * xv; 1583 sum15 += v[14] * xv; 1584 v += 15; 1585 } 1586 } 1587 if (usecprow) z = zarray + 15 * ridx[i]; 1588 z[0] = sum1; 1589 z[1] = sum2; 1590 z[2] = sum3; 1591 z[3] = sum4; 1592 z[4] = sum5; 1593 z[5] = sum6; 1594 z[6] = sum7; 1595 z[7] = sum8; 1596 z[8] = sum9; 1597 z[9] = sum10; 1598 z[10] = sum11; 1599 z[11] = sum12; 1600 z[12] = sum13; 1601 z[13] = sum14; 1602 z[14] = sum15; 1603 1604 if (!usecprow) z += 15; 1605 } 1606 1607 PetscCall(VecRestoreArrayRead(xx, &x)); 1608 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1609 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1610 PetscFunctionReturn(PETSC_SUCCESS); 1611 } 1612 1613 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1614 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1615 { 1616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1617 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1618 const PetscScalar *x, *xb; 1619 PetscScalar x1, x2, x3, x4, *zarray; 1620 const MatScalar *v; 1621 const PetscInt *ii, *ij = a->j, *idx; 1622 PetscInt mbs, i, j, n, *ridx = NULL; 1623 PetscBool usecprow = a->compressedrow.use; 1624 1625 PetscFunctionBegin; 1626 PetscCall(VecGetArrayRead(xx, &x)); 1627 PetscCall(VecGetArrayWrite(zz, &zarray)); 1628 1629 v = a->a; 1630 if (usecprow) { 1631 mbs = a->compressedrow.nrows; 1632 ii = a->compressedrow.i; 1633 ridx = a->compressedrow.rindex; 1634 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1635 } else { 1636 mbs = a->mbs; 1637 ii = a->i; 1638 z = zarray; 1639 } 1640 1641 for (i = 0; i < mbs; i++) { 1642 n = ii[i + 1] - ii[i]; 1643 idx = ij + ii[i]; 1644 sum1 = 0.0; 1645 sum2 = 0.0; 1646 sum3 = 0.0; 1647 sum4 = 0.0; 1648 sum5 = 0.0; 1649 sum6 = 0.0; 1650 sum7 = 0.0; 1651 sum8 = 0.0; 1652 sum9 = 0.0; 1653 sum10 = 0.0; 1654 sum11 = 0.0; 1655 sum12 = 0.0; 1656 sum13 = 0.0; 1657 sum14 = 0.0; 1658 sum15 = 0.0; 1659 1660 for (j = 0; j < n; j++) { 1661 xb = x + 15 * (idx[j]); 1662 x1 = xb[0]; 1663 x2 = xb[1]; 1664 x3 = xb[2]; 1665 x4 = xb[3]; 1666 1667 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1668 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1669 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1670 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1671 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1672 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1673 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1674 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1675 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1676 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1677 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1678 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1679 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1680 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1681 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1682 1683 v += 60; 1684 1685 x1 = xb[4]; 1686 x2 = xb[5]; 1687 x3 = xb[6]; 1688 x4 = xb[7]; 1689 1690 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1691 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1692 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1693 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1694 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1695 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1696 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1697 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1698 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1699 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1700 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1701 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1702 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1703 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1704 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1705 v += 60; 1706 1707 x1 = xb[8]; 1708 x2 = xb[9]; 1709 x3 = xb[10]; 1710 x4 = xb[11]; 1711 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1712 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1713 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1714 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1715 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1716 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1717 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1718 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1719 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1720 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1721 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1722 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1723 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1724 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1725 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1726 v += 60; 1727 1728 x1 = xb[12]; 1729 x2 = xb[13]; 1730 x3 = xb[14]; 1731 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1732 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1733 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1734 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1735 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1736 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1737 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1738 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1739 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1740 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1741 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1742 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1743 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1744 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1745 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1746 v += 45; 1747 } 1748 if (usecprow) z = zarray + 15 * ridx[i]; 1749 z[0] = sum1; 1750 z[1] = sum2; 1751 z[2] = sum3; 1752 z[3] = sum4; 1753 z[4] = sum5; 1754 z[5] = sum6; 1755 z[6] = sum7; 1756 z[7] = sum8; 1757 z[8] = sum9; 1758 z[9] = sum10; 1759 z[10] = sum11; 1760 z[11] = sum12; 1761 z[12] = sum13; 1762 z[13] = sum14; 1763 z[14] = sum15; 1764 1765 if (!usecprow) z += 15; 1766 } 1767 1768 PetscCall(VecRestoreArrayRead(xx, &x)); 1769 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1770 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1771 PetscFunctionReturn(PETSC_SUCCESS); 1772 } 1773 1774 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1775 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1776 { 1777 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1778 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1779 const PetscScalar *x, *xb; 1780 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1781 const MatScalar *v; 1782 const PetscInt *ii, *ij = a->j, *idx; 1783 PetscInt mbs, i, j, n, *ridx = NULL; 1784 PetscBool usecprow = a->compressedrow.use; 1785 1786 PetscFunctionBegin; 1787 PetscCall(VecGetArrayRead(xx, &x)); 1788 PetscCall(VecGetArrayWrite(zz, &zarray)); 1789 1790 v = a->a; 1791 if (usecprow) { 1792 mbs = a->compressedrow.nrows; 1793 ii = a->compressedrow.i; 1794 ridx = a->compressedrow.rindex; 1795 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1796 } else { 1797 mbs = a->mbs; 1798 ii = a->i; 1799 z = zarray; 1800 } 1801 1802 for (i = 0; i < mbs; i++) { 1803 n = ii[i + 1] - ii[i]; 1804 idx = ij + ii[i]; 1805 sum1 = 0.0; 1806 sum2 = 0.0; 1807 sum3 = 0.0; 1808 sum4 = 0.0; 1809 sum5 = 0.0; 1810 sum6 = 0.0; 1811 sum7 = 0.0; 1812 sum8 = 0.0; 1813 sum9 = 0.0; 1814 sum10 = 0.0; 1815 sum11 = 0.0; 1816 sum12 = 0.0; 1817 sum13 = 0.0; 1818 sum14 = 0.0; 1819 sum15 = 0.0; 1820 1821 for (j = 0; j < n; j++) { 1822 xb = x + 15 * (idx[j]); 1823 x1 = xb[0]; 1824 x2 = xb[1]; 1825 x3 = xb[2]; 1826 x4 = xb[3]; 1827 x5 = xb[4]; 1828 x6 = xb[5]; 1829 x7 = xb[6]; 1830 x8 = xb[7]; 1831 1832 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1833 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1834 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1835 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1836 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1837 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1838 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1839 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1840 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1841 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1842 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1843 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1844 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1845 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1846 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1847 v += 120; 1848 1849 x1 = xb[8]; 1850 x2 = xb[9]; 1851 x3 = xb[10]; 1852 x4 = xb[11]; 1853 x5 = xb[12]; 1854 x6 = xb[13]; 1855 x7 = xb[14]; 1856 1857 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1858 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1859 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1860 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1861 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1862 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1863 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1864 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1865 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1866 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1867 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1868 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1869 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1870 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1871 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1872 v += 105; 1873 } 1874 if (usecprow) z = zarray + 15 * ridx[i]; 1875 z[0] = sum1; 1876 z[1] = sum2; 1877 z[2] = sum3; 1878 z[3] = sum4; 1879 z[4] = sum5; 1880 z[5] = sum6; 1881 z[6] = sum7; 1882 z[7] = sum8; 1883 z[8] = sum9; 1884 z[9] = sum10; 1885 z[10] = sum11; 1886 z[11] = sum12; 1887 z[12] = sum13; 1888 z[13] = sum14; 1889 z[14] = sum15; 1890 1891 if (!usecprow) z += 15; 1892 } 1893 1894 PetscCall(VecRestoreArrayRead(xx, &x)); 1895 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1896 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1897 PetscFunctionReturn(PETSC_SUCCESS); 1898 } 1899 1900 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1901 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) 1902 { 1903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1904 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1905 const PetscScalar *x, *xb; 1906 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1907 const MatScalar *v; 1908 const PetscInt *ii, *ij = a->j, *idx; 1909 PetscInt mbs, i, j, n, *ridx = NULL; 1910 PetscBool usecprow = a->compressedrow.use; 1911 1912 PetscFunctionBegin; 1913 PetscCall(VecGetArrayRead(xx, &x)); 1914 PetscCall(VecGetArrayWrite(zz, &zarray)); 1915 1916 v = a->a; 1917 if (usecprow) { 1918 mbs = a->compressedrow.nrows; 1919 ii = a->compressedrow.i; 1920 ridx = a->compressedrow.rindex; 1921 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1922 } else { 1923 mbs = a->mbs; 1924 ii = a->i; 1925 z = zarray; 1926 } 1927 1928 for (i = 0; i < mbs; i++) { 1929 n = ii[i + 1] - ii[i]; 1930 idx = ij + ii[i]; 1931 sum1 = 0.0; 1932 sum2 = 0.0; 1933 sum3 = 0.0; 1934 sum4 = 0.0; 1935 sum5 = 0.0; 1936 sum6 = 0.0; 1937 sum7 = 0.0; 1938 sum8 = 0.0; 1939 sum9 = 0.0; 1940 sum10 = 0.0; 1941 sum11 = 0.0; 1942 sum12 = 0.0; 1943 sum13 = 0.0; 1944 sum14 = 0.0; 1945 sum15 = 0.0; 1946 1947 for (j = 0; j < n; j++) { 1948 xb = x + 15 * (idx[j]); 1949 x1 = xb[0]; 1950 x2 = xb[1]; 1951 x3 = xb[2]; 1952 x4 = xb[3]; 1953 x5 = xb[4]; 1954 x6 = xb[5]; 1955 x7 = xb[6]; 1956 x8 = xb[7]; 1957 x9 = xb[8]; 1958 x10 = xb[9]; 1959 x11 = xb[10]; 1960 x12 = xb[11]; 1961 x13 = xb[12]; 1962 x14 = xb[13]; 1963 x15 = xb[14]; 1964 1965 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1966 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1967 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1968 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1969 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1970 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1971 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1972 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1973 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1974 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1975 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1976 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1977 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1978 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1979 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1980 v += 225; 1981 } 1982 if (usecprow) z = zarray + 15 * ridx[i]; 1983 z[0] = sum1; 1984 z[1] = sum2; 1985 z[2] = sum3; 1986 z[3] = sum4; 1987 z[4] = sum5; 1988 z[5] = sum6; 1989 z[6] = sum7; 1990 z[7] = sum8; 1991 z[8] = sum9; 1992 z[9] = sum10; 1993 z[10] = sum11; 1994 z[11] = sum12; 1995 z[12] = sum13; 1996 z[13] = sum14; 1997 z[14] = sum15; 1998 1999 if (!usecprow) z += 15; 2000 } 2001 2002 PetscCall(VecRestoreArrayRead(xx, &x)); 2003 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2004 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 2005 PetscFunctionReturn(PETSC_SUCCESS); 2006 } 2007 2008 /* 2009 This will not work with MatScalar == float because it calls the BLAS 2010 */ 2011 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 2012 { 2013 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2014 PetscScalar *z = NULL, *work, *workt, *zarray; 2015 const PetscScalar *x, *xb; 2016 const MatScalar *v; 2017 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2018 const PetscInt *idx, *ii, *ridx = NULL; 2019 PetscInt ncols, k; 2020 PetscBool usecprow = a->compressedrow.use; 2021 2022 PetscFunctionBegin; 2023 PetscCall(VecGetArrayRead(xx, &x)); 2024 PetscCall(VecGetArrayWrite(zz, &zarray)); 2025 2026 idx = a->j; 2027 v = a->a; 2028 if (usecprow) { 2029 mbs = a->compressedrow.nrows; 2030 ii = a->compressedrow.i; 2031 ridx = a->compressedrow.rindex; 2032 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2033 } else { 2034 mbs = a->mbs; 2035 ii = a->i; 2036 z = zarray; 2037 } 2038 2039 if (!a->mult_work) { 2040 k = PetscMax(A->rmap->n, A->cmap->n); 2041 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2042 } 2043 work = a->mult_work; 2044 for (i = 0; i < mbs; i++) { 2045 n = ii[1] - ii[0]; 2046 ii++; 2047 ncols = n * bs; 2048 workt = work; 2049 for (j = 0; j < n; j++) { 2050 xb = x + bs * (*idx++); 2051 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2052 workt += bs; 2053 } 2054 if (usecprow) z = zarray + bs * ridx[i]; 2055 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2056 v += n * bs2; 2057 if (!usecprow) z += bs; 2058 } 2059 PetscCall(VecRestoreArrayRead(xx, &x)); 2060 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2061 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2062 PetscFunctionReturn(PETSC_SUCCESS); 2063 } 2064 2065 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2066 { 2067 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2068 const PetscScalar *x; 2069 PetscScalar *y, *z, sum; 2070 const MatScalar *v; 2071 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2072 const PetscInt *idx, *ii; 2073 PetscBool usecprow = a->compressedrow.use; 2074 2075 PetscFunctionBegin; 2076 PetscCall(VecGetArrayRead(xx, &x)); 2077 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2078 2079 idx = a->j; 2080 v = a->a; 2081 if (usecprow) { 2082 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 2083 mbs = a->compressedrow.nrows; 2084 ii = a->compressedrow.i; 2085 ridx = a->compressedrow.rindex; 2086 } else { 2087 ii = a->i; 2088 } 2089 2090 for (i = 0; i < mbs; i++) { 2091 n = ii[1] - ii[0]; 2092 ii++; 2093 if (!usecprow) { 2094 sum = y[i]; 2095 } else { 2096 sum = y[ridx[i]]; 2097 } 2098 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2099 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2100 PetscSparseDensePlusDot(sum, x, v, idx, n); 2101 v += n; 2102 idx += n; 2103 if (usecprow) { 2104 z[ridx[i]] = sum; 2105 } else { 2106 z[i] = sum; 2107 } 2108 } 2109 PetscCall(VecRestoreArrayRead(xx, &x)); 2110 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2111 PetscCall(PetscLogFlops(2.0 * a->nz)); 2112 PetscFunctionReturn(PETSC_SUCCESS); 2113 } 2114 2115 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2116 { 2117 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2118 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2119 const PetscScalar *x, *xb; 2120 PetscScalar x1, x2, *yarray, *zarray; 2121 const MatScalar *v; 2122 PetscInt mbs = a->mbs, i, n, j; 2123 const PetscInt *idx, *ii, *ridx = NULL; 2124 PetscBool usecprow = a->compressedrow.use; 2125 2126 PetscFunctionBegin; 2127 PetscCall(VecGetArrayRead(xx, &x)); 2128 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2129 2130 idx = a->j; 2131 v = a->a; 2132 if (usecprow) { 2133 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 2134 mbs = a->compressedrow.nrows; 2135 ii = a->compressedrow.i; 2136 ridx = a->compressedrow.rindex; 2137 } else { 2138 ii = a->i; 2139 y = yarray; 2140 z = zarray; 2141 } 2142 2143 for (i = 0; i < mbs; i++) { 2144 n = ii[1] - ii[0]; 2145 ii++; 2146 if (usecprow) { 2147 z = zarray + 2 * ridx[i]; 2148 y = yarray + 2 * ridx[i]; 2149 } 2150 sum1 = y[0]; 2151 sum2 = y[1]; 2152 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2153 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2154 for (j = 0; j < n; j++) { 2155 xb = x + 2 * (*idx++); 2156 x1 = xb[0]; 2157 x2 = xb[1]; 2158 2159 sum1 += v[0] * x1 + v[2] * x2; 2160 sum2 += v[1] * x1 + v[3] * x2; 2161 v += 4; 2162 } 2163 z[0] = sum1; 2164 z[1] = sum2; 2165 if (!usecprow) { 2166 z += 2; 2167 y += 2; 2168 } 2169 } 2170 PetscCall(VecRestoreArrayRead(xx, &x)); 2171 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2172 PetscCall(PetscLogFlops(4.0 * a->nz)); 2173 PetscFunctionReturn(PETSC_SUCCESS); 2174 } 2175 2176 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2177 { 2178 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2179 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2180 const PetscScalar *x, *xb; 2181 const MatScalar *v; 2182 PetscInt mbs = a->mbs, i, j, n; 2183 const PetscInt *idx, *ii, *ridx = NULL; 2184 PetscBool usecprow = a->compressedrow.use; 2185 2186 PetscFunctionBegin; 2187 PetscCall(VecGetArrayRead(xx, &x)); 2188 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2189 2190 idx = a->j; 2191 v = a->a; 2192 if (usecprow) { 2193 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 2194 mbs = a->compressedrow.nrows; 2195 ii = a->compressedrow.i; 2196 ridx = a->compressedrow.rindex; 2197 } else { 2198 ii = a->i; 2199 y = yarray; 2200 z = zarray; 2201 } 2202 2203 for (i = 0; i < mbs; i++) { 2204 n = ii[1] - ii[0]; 2205 ii++; 2206 if (usecprow) { 2207 z = zarray + 3 * ridx[i]; 2208 y = yarray + 3 * ridx[i]; 2209 } 2210 sum1 = y[0]; 2211 sum2 = y[1]; 2212 sum3 = y[2]; 2213 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2214 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2215 for (j = 0; j < n; j++) { 2216 xb = x + 3 * (*idx++); 2217 x1 = xb[0]; 2218 x2 = xb[1]; 2219 x3 = xb[2]; 2220 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2221 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2222 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2223 v += 9; 2224 } 2225 z[0] = sum1; 2226 z[1] = sum2; 2227 z[2] = sum3; 2228 if (!usecprow) { 2229 z += 3; 2230 y += 3; 2231 } 2232 } 2233 PetscCall(VecRestoreArrayRead(xx, &x)); 2234 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2235 PetscCall(PetscLogFlops(18.0 * a->nz)); 2236 PetscFunctionReturn(PETSC_SUCCESS); 2237 } 2238 2239 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2240 { 2241 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2242 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2243 const PetscScalar *x, *xb; 2244 const MatScalar *v; 2245 PetscInt mbs = a->mbs, i, j, n; 2246 const PetscInt *idx, *ii, *ridx = NULL; 2247 PetscBool usecprow = a->compressedrow.use; 2248 2249 PetscFunctionBegin; 2250 PetscCall(VecGetArrayRead(xx, &x)); 2251 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2252 2253 idx = a->j; 2254 v = a->a; 2255 if (usecprow) { 2256 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 2257 mbs = a->compressedrow.nrows; 2258 ii = a->compressedrow.i; 2259 ridx = a->compressedrow.rindex; 2260 } else { 2261 ii = a->i; 2262 y = yarray; 2263 z = zarray; 2264 } 2265 2266 for (i = 0; i < mbs; i++) { 2267 n = ii[1] - ii[0]; 2268 ii++; 2269 if (usecprow) { 2270 z = zarray + 4 * ridx[i]; 2271 y = yarray + 4 * ridx[i]; 2272 } 2273 sum1 = y[0]; 2274 sum2 = y[1]; 2275 sum3 = y[2]; 2276 sum4 = y[3]; 2277 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2278 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2279 for (j = 0; j < n; j++) { 2280 xb = x + 4 * (*idx++); 2281 x1 = xb[0]; 2282 x2 = xb[1]; 2283 x3 = xb[2]; 2284 x4 = xb[3]; 2285 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2286 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2287 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2288 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2289 v += 16; 2290 } 2291 z[0] = sum1; 2292 z[1] = sum2; 2293 z[2] = sum3; 2294 z[3] = sum4; 2295 if (!usecprow) { 2296 z += 4; 2297 y += 4; 2298 } 2299 } 2300 PetscCall(VecRestoreArrayRead(xx, &x)); 2301 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2302 PetscCall(PetscLogFlops(32.0 * a->nz)); 2303 PetscFunctionReturn(PETSC_SUCCESS); 2304 } 2305 2306 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2307 { 2308 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2309 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2310 const PetscScalar *x, *xb; 2311 PetscScalar *yarray, *zarray; 2312 const MatScalar *v; 2313 PetscInt mbs = a->mbs, i, j, n; 2314 const PetscInt *idx, *ii, *ridx = NULL; 2315 PetscBool usecprow = a->compressedrow.use; 2316 2317 PetscFunctionBegin; 2318 PetscCall(VecGetArrayRead(xx, &x)); 2319 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2320 2321 idx = a->j; 2322 v = a->a; 2323 if (usecprow) { 2324 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 2325 mbs = a->compressedrow.nrows; 2326 ii = a->compressedrow.i; 2327 ridx = a->compressedrow.rindex; 2328 } else { 2329 ii = a->i; 2330 y = yarray; 2331 z = zarray; 2332 } 2333 2334 for (i = 0; i < mbs; i++) { 2335 n = ii[1] - ii[0]; 2336 ii++; 2337 if (usecprow) { 2338 z = zarray + 5 * ridx[i]; 2339 y = yarray + 5 * ridx[i]; 2340 } 2341 sum1 = y[0]; 2342 sum2 = y[1]; 2343 sum3 = y[2]; 2344 sum4 = y[3]; 2345 sum5 = y[4]; 2346 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2347 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2348 for (j = 0; j < n; j++) { 2349 xb = x + 5 * (*idx++); 2350 x1 = xb[0]; 2351 x2 = xb[1]; 2352 x3 = xb[2]; 2353 x4 = xb[3]; 2354 x5 = xb[4]; 2355 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2356 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2357 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2358 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2359 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2360 v += 25; 2361 } 2362 z[0] = sum1; 2363 z[1] = sum2; 2364 z[2] = sum3; 2365 z[3] = sum4; 2366 z[4] = sum5; 2367 if (!usecprow) { 2368 z += 5; 2369 y += 5; 2370 } 2371 } 2372 PetscCall(VecRestoreArrayRead(xx, &x)); 2373 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2374 PetscCall(PetscLogFlops(50.0 * a->nz)); 2375 PetscFunctionReturn(PETSC_SUCCESS); 2376 } 2377 2378 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2379 { 2380 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2381 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2382 const PetscScalar *x, *xb; 2383 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2384 const MatScalar *v; 2385 PetscInt mbs = a->mbs, i, j, n; 2386 const PetscInt *idx, *ii, *ridx = NULL; 2387 PetscBool usecprow = a->compressedrow.use; 2388 2389 PetscFunctionBegin; 2390 PetscCall(VecGetArrayRead(xx, &x)); 2391 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2392 2393 idx = a->j; 2394 v = a->a; 2395 if (usecprow) { 2396 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 2397 mbs = a->compressedrow.nrows; 2398 ii = a->compressedrow.i; 2399 ridx = a->compressedrow.rindex; 2400 } else { 2401 ii = a->i; 2402 y = yarray; 2403 z = zarray; 2404 } 2405 2406 for (i = 0; i < mbs; i++) { 2407 n = ii[1] - ii[0]; 2408 ii++; 2409 if (usecprow) { 2410 z = zarray + 6 * ridx[i]; 2411 y = yarray + 6 * ridx[i]; 2412 } 2413 sum1 = y[0]; 2414 sum2 = y[1]; 2415 sum3 = y[2]; 2416 sum4 = y[3]; 2417 sum5 = y[4]; 2418 sum6 = y[5]; 2419 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2420 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2421 for (j = 0; j < n; j++) { 2422 xb = x + 6 * (*idx++); 2423 x1 = xb[0]; 2424 x2 = xb[1]; 2425 x3 = xb[2]; 2426 x4 = xb[3]; 2427 x5 = xb[4]; 2428 x6 = xb[5]; 2429 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2430 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2431 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2432 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2433 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2434 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2435 v += 36; 2436 } 2437 z[0] = sum1; 2438 z[1] = sum2; 2439 z[2] = sum3; 2440 z[3] = sum4; 2441 z[4] = sum5; 2442 z[5] = sum6; 2443 if (!usecprow) { 2444 z += 6; 2445 y += 6; 2446 } 2447 } 2448 PetscCall(VecRestoreArrayRead(xx, &x)); 2449 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2450 PetscCall(PetscLogFlops(72.0 * a->nz)); 2451 PetscFunctionReturn(PETSC_SUCCESS); 2452 } 2453 2454 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2455 { 2456 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2457 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2458 const PetscScalar *x, *xb; 2459 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2460 const MatScalar *v; 2461 PetscInt mbs = a->mbs, i, j, n; 2462 const PetscInt *idx, *ii, *ridx = NULL; 2463 PetscBool usecprow = a->compressedrow.use; 2464 2465 PetscFunctionBegin; 2466 PetscCall(VecGetArrayRead(xx, &x)); 2467 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2468 2469 idx = a->j; 2470 v = a->a; 2471 if (usecprow) { 2472 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2473 mbs = a->compressedrow.nrows; 2474 ii = a->compressedrow.i; 2475 ridx = a->compressedrow.rindex; 2476 } else { 2477 ii = a->i; 2478 y = yarray; 2479 z = zarray; 2480 } 2481 2482 for (i = 0; i < mbs; i++) { 2483 n = ii[1] - ii[0]; 2484 ii++; 2485 if (usecprow) { 2486 z = zarray + 7 * ridx[i]; 2487 y = yarray + 7 * ridx[i]; 2488 } 2489 sum1 = y[0]; 2490 sum2 = y[1]; 2491 sum3 = y[2]; 2492 sum4 = y[3]; 2493 sum5 = y[4]; 2494 sum6 = y[5]; 2495 sum7 = y[6]; 2496 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2497 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2498 for (j = 0; j < n; j++) { 2499 xb = x + 7 * (*idx++); 2500 x1 = xb[0]; 2501 x2 = xb[1]; 2502 x3 = xb[2]; 2503 x4 = xb[3]; 2504 x5 = xb[4]; 2505 x6 = xb[5]; 2506 x7 = xb[6]; 2507 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2508 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2509 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2510 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2511 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2512 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2513 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2514 v += 49; 2515 } 2516 z[0] = sum1; 2517 z[1] = sum2; 2518 z[2] = sum3; 2519 z[3] = sum4; 2520 z[4] = sum5; 2521 z[5] = sum6; 2522 z[6] = sum7; 2523 if (!usecprow) { 2524 z += 7; 2525 y += 7; 2526 } 2527 } 2528 PetscCall(VecRestoreArrayRead(xx, &x)); 2529 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2530 PetscCall(PetscLogFlops(98.0 * a->nz)); 2531 PetscFunctionReturn(PETSC_SUCCESS); 2532 } 2533 2534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2535 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2536 { 2537 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2538 PetscScalar *z = NULL, *work, *workt, *zarray; 2539 const PetscScalar *x, *xb; 2540 const MatScalar *v; 2541 PetscInt mbs, i, j, n; 2542 PetscInt k; 2543 PetscBool usecprow = a->compressedrow.use; 2544 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2545 2546 __m256d a0, a1, a2, a3, a4, a5; 2547 __m256d w0, w1, w2, w3; 2548 __m256d z0, z1, z2; 2549 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2550 2551 PetscFunctionBegin; 2552 PetscCall(VecCopy(yy, zz)); 2553 PetscCall(VecGetArrayRead(xx, &x)); 2554 PetscCall(VecGetArray(zz, &zarray)); 2555 2556 idx = a->j; 2557 v = a->a; 2558 if (usecprow) { 2559 mbs = a->compressedrow.nrows; 2560 ii = a->compressedrow.i; 2561 ridx = a->compressedrow.rindex; 2562 } else { 2563 mbs = a->mbs; 2564 ii = a->i; 2565 z = zarray; 2566 } 2567 2568 if (!a->mult_work) { 2569 k = PetscMax(A->rmap->n, A->cmap->n); 2570 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2571 } 2572 2573 work = a->mult_work; 2574 for (i = 0; i < mbs; i++) { 2575 n = ii[1] - ii[0]; 2576 ii++; 2577 workt = work; 2578 for (j = 0; j < n; j++) { 2579 xb = x + bs * (*idx++); 2580 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2581 workt += bs; 2582 } 2583 if (usecprow) z = zarray + bs * ridx[i]; 2584 2585 z0 = _mm256_loadu_pd(&z[0]); 2586 z1 = _mm256_loadu_pd(&z[4]); 2587 z2 = _mm256_set1_pd(z[8]); 2588 2589 for (j = 0; j < n; j++) { 2590 /* first column of a */ 2591 w0 = _mm256_set1_pd(work[j * 9]); 2592 a0 = _mm256_loadu_pd(&v[j * 81]); 2593 z0 = _mm256_fmadd_pd(a0, w0, z0); 2594 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2595 z1 = _mm256_fmadd_pd(a1, w0, z1); 2596 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2597 z2 = _mm256_fmadd_pd(a2, w0, z2); 2598 2599 /* second column of a */ 2600 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2601 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2602 z0 = _mm256_fmadd_pd(a0, w1, z0); 2603 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2604 z1 = _mm256_fmadd_pd(a1, w1, z1); 2605 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2606 z2 = _mm256_fmadd_pd(a2, w1, z2); 2607 2608 /* third column of a */ 2609 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2610 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2611 z0 = _mm256_fmadd_pd(a3, w2, z0); 2612 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2613 z1 = _mm256_fmadd_pd(a4, w2, z1); 2614 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2615 z2 = _mm256_fmadd_pd(a5, w2, z2); 2616 2617 /* fourth column of a */ 2618 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2619 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2620 z0 = _mm256_fmadd_pd(a0, w3, z0); 2621 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2622 z1 = _mm256_fmadd_pd(a1, w3, z1); 2623 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2624 z2 = _mm256_fmadd_pd(a2, w3, z2); 2625 2626 /* fifth column of a */ 2627 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2628 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2629 z0 = _mm256_fmadd_pd(a3, w0, z0); 2630 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2631 z1 = _mm256_fmadd_pd(a4, w0, z1); 2632 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2633 z2 = _mm256_fmadd_pd(a5, w0, z2); 2634 2635 /* sixth column of a */ 2636 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2637 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2638 z0 = _mm256_fmadd_pd(a0, w1, z0); 2639 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2640 z1 = _mm256_fmadd_pd(a1, w1, z1); 2641 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2642 z2 = _mm256_fmadd_pd(a2, w1, z2); 2643 2644 /* seventh column of a */ 2645 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2646 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2647 z0 = _mm256_fmadd_pd(a0, w2, z0); 2648 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2649 z1 = _mm256_fmadd_pd(a1, w2, z1); 2650 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2651 z2 = _mm256_fmadd_pd(a2, w2, z2); 2652 2653 /* eighth column of a */ 2654 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2655 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2656 z0 = _mm256_fmadd_pd(a3, w3, z0); 2657 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2658 z1 = _mm256_fmadd_pd(a4, w3, z1); 2659 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2660 z2 = _mm256_fmadd_pd(a5, w3, z2); 2661 2662 /* ninth column of a */ 2663 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2664 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2665 z0 = _mm256_fmadd_pd(a0, w0, z0); 2666 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2667 z1 = _mm256_fmadd_pd(a1, w0, z1); 2668 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2669 z2 = _mm256_fmadd_pd(a2, w0, z2); 2670 } 2671 2672 _mm256_storeu_pd(&z[0], z0); 2673 _mm256_storeu_pd(&z[4], z1); 2674 _mm256_maskstore_pd(&z[8], mask1, z2); 2675 2676 v += n * bs2; 2677 if (!usecprow) z += bs; 2678 } 2679 PetscCall(VecRestoreArrayRead(xx, &x)); 2680 PetscCall(VecRestoreArray(zz, &zarray)); 2681 PetscCall(PetscLogFlops(162.0 * a->nz)); 2682 PetscFunctionReturn(PETSC_SUCCESS); 2683 } 2684 #endif 2685 2686 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2687 { 2688 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2689 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2690 const PetscScalar *x, *xb; 2691 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2692 const MatScalar *v; 2693 PetscInt mbs = a->mbs, i, j, n; 2694 const PetscInt *idx, *ii, *ridx = NULL; 2695 PetscBool usecprow = a->compressedrow.use; 2696 2697 PetscFunctionBegin; 2698 PetscCall(VecGetArrayRead(xx, &x)); 2699 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2700 2701 idx = a->j; 2702 v = a->a; 2703 if (usecprow) { 2704 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2705 mbs = a->compressedrow.nrows; 2706 ii = a->compressedrow.i; 2707 ridx = a->compressedrow.rindex; 2708 } else { 2709 ii = a->i; 2710 y = yarray; 2711 z = zarray; 2712 } 2713 2714 for (i = 0; i < mbs; i++) { 2715 n = ii[1] - ii[0]; 2716 ii++; 2717 if (usecprow) { 2718 z = zarray + 11 * ridx[i]; 2719 y = yarray + 11 * ridx[i]; 2720 } 2721 sum1 = y[0]; 2722 sum2 = y[1]; 2723 sum3 = y[2]; 2724 sum4 = y[3]; 2725 sum5 = y[4]; 2726 sum6 = y[5]; 2727 sum7 = y[6]; 2728 sum8 = y[7]; 2729 sum9 = y[8]; 2730 sum10 = y[9]; 2731 sum11 = y[10]; 2732 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2733 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2734 for (j = 0; j < n; j++) { 2735 xb = x + 11 * (*idx++); 2736 x1 = xb[0]; 2737 x2 = xb[1]; 2738 x3 = xb[2]; 2739 x4 = xb[3]; 2740 x5 = xb[4]; 2741 x6 = xb[5]; 2742 x7 = xb[6]; 2743 x8 = xb[7]; 2744 x9 = xb[8]; 2745 x10 = xb[9]; 2746 x11 = xb[10]; 2747 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2748 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2749 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2750 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2751 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2752 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2753 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2754 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2755 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2756 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2757 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2758 v += 121; 2759 } 2760 z[0] = sum1; 2761 z[1] = sum2; 2762 z[2] = sum3; 2763 z[3] = sum4; 2764 z[4] = sum5; 2765 z[5] = sum6; 2766 z[6] = sum7; 2767 z[7] = sum8; 2768 z[8] = sum9; 2769 z[9] = sum10; 2770 z[10] = sum11; 2771 if (!usecprow) { 2772 z += 11; 2773 y += 11; 2774 } 2775 } 2776 PetscCall(VecRestoreArrayRead(xx, &x)); 2777 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2778 PetscCall(PetscLogFlops(242.0 * a->nz)); 2779 PetscFunctionReturn(PETSC_SUCCESS); 2780 } 2781 2782 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2783 { 2784 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2785 PetscScalar *z = NULL, *work, *workt, *zarray; 2786 const PetscScalar *x, *xb; 2787 const MatScalar *v; 2788 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2789 PetscInt ncols, k; 2790 const PetscInt *ridx = NULL, *idx, *ii; 2791 PetscBool usecprow = a->compressedrow.use; 2792 2793 PetscFunctionBegin; 2794 PetscCall(VecCopy(yy, zz)); 2795 PetscCall(VecGetArrayRead(xx, &x)); 2796 PetscCall(VecGetArray(zz, &zarray)); 2797 2798 idx = a->j; 2799 v = a->a; 2800 if (usecprow) { 2801 mbs = a->compressedrow.nrows; 2802 ii = a->compressedrow.i; 2803 ridx = a->compressedrow.rindex; 2804 } else { 2805 mbs = a->mbs; 2806 ii = a->i; 2807 z = zarray; 2808 } 2809 2810 if (!a->mult_work) { 2811 k = PetscMax(A->rmap->n, A->cmap->n); 2812 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2813 } 2814 work = a->mult_work; 2815 for (i = 0; i < mbs; i++) { 2816 n = ii[1] - ii[0]; 2817 ii++; 2818 ncols = n * bs; 2819 workt = work; 2820 for (j = 0; j < n; j++) { 2821 xb = x + bs * (*idx++); 2822 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2823 workt += bs; 2824 } 2825 if (usecprow) z = zarray + bs * ridx[i]; 2826 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2827 v += n * bs2; 2828 if (!usecprow) z += bs; 2829 } 2830 PetscCall(VecRestoreArrayRead(xx, &x)); 2831 PetscCall(VecRestoreArray(zz, &zarray)); 2832 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2833 PetscFunctionReturn(PETSC_SUCCESS); 2834 } 2835 2836 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2837 { 2838 PetscScalar zero = 0.0; 2839 2840 PetscFunctionBegin; 2841 PetscCall(VecSet(zz, zero)); 2842 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2843 PetscFunctionReturn(PETSC_SUCCESS); 2844 } 2845 2846 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2847 { 2848 PetscScalar zero = 0.0; 2849 2850 PetscFunctionBegin; 2851 PetscCall(VecSet(zz, zero)); 2852 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2853 PetscFunctionReturn(PETSC_SUCCESS); 2854 } 2855 2856 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2857 { 2858 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2859 PetscScalar *z, x1, x2, x3, x4, x5; 2860 const PetscScalar *x, *xb = NULL; 2861 const MatScalar *v; 2862 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2863 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2864 Mat_CompressedRow cprow = a->compressedrow; 2865 PetscBool usecprow = cprow.use; 2866 2867 PetscFunctionBegin; 2868 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2869 PetscCall(VecGetArrayRead(xx, &x)); 2870 PetscCall(VecGetArray(zz, &z)); 2871 2872 idx = a->j; 2873 v = a->a; 2874 if (usecprow) { 2875 mbs = cprow.nrows; 2876 ii = cprow.i; 2877 ridx = cprow.rindex; 2878 } else { 2879 mbs = a->mbs; 2880 ii = a->i; 2881 xb = x; 2882 } 2883 2884 switch (bs) { 2885 case 1: 2886 for (i = 0; i < mbs; i++) { 2887 if (usecprow) xb = x + ridx[i]; 2888 x1 = xb[0]; 2889 ib = idx + ii[0]; 2890 n = ii[1] - ii[0]; 2891 ii++; 2892 for (j = 0; j < n; j++) { 2893 rval = ib[j]; 2894 z[rval] += PetscConj(*v) * x1; 2895 v++; 2896 } 2897 if (!usecprow) xb++; 2898 } 2899 break; 2900 case 2: 2901 for (i = 0; i < mbs; i++) { 2902 if (usecprow) xb = x + 2 * ridx[i]; 2903 x1 = xb[0]; 2904 x2 = xb[1]; 2905 ib = idx + ii[0]; 2906 n = ii[1] - ii[0]; 2907 ii++; 2908 for (j = 0; j < n; j++) { 2909 rval = ib[j] * 2; 2910 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2911 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2912 v += 4; 2913 } 2914 if (!usecprow) xb += 2; 2915 } 2916 break; 2917 case 3: 2918 for (i = 0; i < mbs; i++) { 2919 if (usecprow) xb = x + 3 * ridx[i]; 2920 x1 = xb[0]; 2921 x2 = xb[1]; 2922 x3 = xb[2]; 2923 ib = idx + ii[0]; 2924 n = ii[1] - ii[0]; 2925 ii++; 2926 for (j = 0; j < n; j++) { 2927 rval = ib[j] * 3; 2928 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2929 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2930 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2931 v += 9; 2932 } 2933 if (!usecprow) xb += 3; 2934 } 2935 break; 2936 case 4: 2937 for (i = 0; i < mbs; i++) { 2938 if (usecprow) xb = x + 4 * ridx[i]; 2939 x1 = xb[0]; 2940 x2 = xb[1]; 2941 x3 = xb[2]; 2942 x4 = xb[3]; 2943 ib = idx + ii[0]; 2944 n = ii[1] - ii[0]; 2945 ii++; 2946 for (j = 0; j < n; j++) { 2947 rval = ib[j] * 4; 2948 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2949 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2950 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2951 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2952 v += 16; 2953 } 2954 if (!usecprow) xb += 4; 2955 } 2956 break; 2957 case 5: 2958 for (i = 0; i < mbs; i++) { 2959 if (usecprow) xb = x + 5 * ridx[i]; 2960 x1 = xb[0]; 2961 x2 = xb[1]; 2962 x3 = xb[2]; 2963 x4 = xb[3]; 2964 x5 = xb[4]; 2965 ib = idx + ii[0]; 2966 n = ii[1] - ii[0]; 2967 ii++; 2968 for (j = 0; j < n; j++) { 2969 rval = ib[j] * 5; 2970 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2971 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2972 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2973 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2974 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2975 v += 25; 2976 } 2977 if (!usecprow) xb += 5; 2978 } 2979 break; 2980 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2981 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2982 #if 0 2983 { 2984 PetscInt ncols,k,bs2=a->bs2; 2985 PetscScalar *work,*workt,zb; 2986 const PetscScalar *xtmp; 2987 if (!a->mult_work) { 2988 k = PetscMax(A->rmap->n,A->cmap->n); 2989 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2990 } 2991 work = a->mult_work; 2992 xtmp = x; 2993 for (i=0; i<mbs; i++) { 2994 n = ii[1] - ii[0]; ii++; 2995 ncols = n*bs; 2996 PetscCall(PetscArrayzero(work,ncols)); 2997 if (usecprow) xtmp = x + bs*ridx[i]; 2998 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2999 v += n*bs2; 3000 if (!usecprow) xtmp += bs; 3001 workt = work; 3002 for (j=0; j<n; j++) { 3003 zb = z + bs*(*idx++); 3004 for (k=0; k<bs; k++) zb[k] += workt[k] ; 3005 workt += bs; 3006 } 3007 } 3008 } 3009 #endif 3010 } 3011 PetscCall(VecRestoreArrayRead(xx, &x)); 3012 PetscCall(VecRestoreArray(zz, &z)); 3013 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3014 PetscFunctionReturn(PETSC_SUCCESS); 3015 } 3016 3017 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 3018 { 3019 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3020 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 3021 const PetscScalar *x, *xb = NULL; 3022 const MatScalar *v; 3023 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3024 const PetscInt *idx, *ii, *ib, *ridx = NULL; 3025 Mat_CompressedRow cprow = a->compressedrow; 3026 PetscBool usecprow = cprow.use; 3027 3028 PetscFunctionBegin; 3029 if (yy != zz) PetscCall(VecCopy(yy, zz)); 3030 PetscCall(VecGetArrayRead(xx, &x)); 3031 PetscCall(VecGetArray(zz, &z)); 3032 3033 idx = a->j; 3034 v = a->a; 3035 if (usecprow) { 3036 mbs = cprow.nrows; 3037 ii = cprow.i; 3038 ridx = cprow.rindex; 3039 } else { 3040 mbs = a->mbs; 3041 ii = a->i; 3042 xb = x; 3043 } 3044 3045 switch (bs) { 3046 case 1: 3047 for (i = 0; i < mbs; i++) { 3048 if (usecprow) xb = x + ridx[i]; 3049 x1 = xb[0]; 3050 ib = idx + ii[0]; 3051 n = ii[1] - ii[0]; 3052 ii++; 3053 for (j = 0; j < n; j++) { 3054 rval = ib[j]; 3055 z[rval] += *v * x1; 3056 v++; 3057 } 3058 if (!usecprow) xb++; 3059 } 3060 break; 3061 case 2: 3062 for (i = 0; i < mbs; i++) { 3063 if (usecprow) xb = x + 2 * ridx[i]; 3064 x1 = xb[0]; 3065 x2 = xb[1]; 3066 ib = idx + ii[0]; 3067 n = ii[1] - ii[0]; 3068 ii++; 3069 for (j = 0; j < n; j++) { 3070 rval = ib[j] * 2; 3071 z[rval++] += v[0] * x1 + v[1] * x2; 3072 z[rval++] += v[2] * x1 + v[3] * x2; 3073 v += 4; 3074 } 3075 if (!usecprow) xb += 2; 3076 } 3077 break; 3078 case 3: 3079 for (i = 0; i < mbs; i++) { 3080 if (usecprow) xb = x + 3 * ridx[i]; 3081 x1 = xb[0]; 3082 x2 = xb[1]; 3083 x3 = xb[2]; 3084 ib = idx + ii[0]; 3085 n = ii[1] - ii[0]; 3086 ii++; 3087 for (j = 0; j < n; j++) { 3088 rval = ib[j] * 3; 3089 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3090 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3091 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3092 v += 9; 3093 } 3094 if (!usecprow) xb += 3; 3095 } 3096 break; 3097 case 4: 3098 for (i = 0; i < mbs; i++) { 3099 if (usecprow) xb = x + 4 * ridx[i]; 3100 x1 = xb[0]; 3101 x2 = xb[1]; 3102 x3 = xb[2]; 3103 x4 = xb[3]; 3104 ib = idx + ii[0]; 3105 n = ii[1] - ii[0]; 3106 ii++; 3107 for (j = 0; j < n; j++) { 3108 rval = ib[j] * 4; 3109 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3110 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3111 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3112 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3113 v += 16; 3114 } 3115 if (!usecprow) xb += 4; 3116 } 3117 break; 3118 case 5: 3119 for (i = 0; i < mbs; i++) { 3120 if (usecprow) xb = x + 5 * ridx[i]; 3121 x1 = xb[0]; 3122 x2 = xb[1]; 3123 x3 = xb[2]; 3124 x4 = xb[3]; 3125 x5 = xb[4]; 3126 ib = idx + ii[0]; 3127 n = ii[1] - ii[0]; 3128 ii++; 3129 for (j = 0; j < n; j++) { 3130 rval = ib[j] * 5; 3131 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3132 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3133 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3134 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3135 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3136 v += 25; 3137 } 3138 if (!usecprow) xb += 5; 3139 } 3140 break; 3141 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3142 PetscInt ncols, k; 3143 PetscScalar *work, *workt; 3144 const PetscScalar *xtmp; 3145 if (!a->mult_work) { 3146 k = PetscMax(A->rmap->n, A->cmap->n); 3147 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3148 } 3149 work = a->mult_work; 3150 xtmp = x; 3151 for (i = 0; i < mbs; i++) { 3152 n = ii[1] - ii[0]; 3153 ii++; 3154 ncols = n * bs; 3155 PetscCall(PetscArrayzero(work, ncols)); 3156 if (usecprow) xtmp = x + bs * ridx[i]; 3157 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3158 v += n * bs2; 3159 if (!usecprow) xtmp += bs; 3160 workt = work; 3161 for (j = 0; j < n; j++) { 3162 zb = z + bs * (*idx++); 3163 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3164 workt += bs; 3165 } 3166 } 3167 } 3168 } 3169 PetscCall(VecRestoreArrayRead(xx, &x)); 3170 PetscCall(VecRestoreArray(zz, &z)); 3171 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3172 PetscFunctionReturn(PETSC_SUCCESS); 3173 } 3174 3175 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3176 { 3177 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3178 PetscInt totalnz = a->bs2 * a->nz; 3179 PetscScalar oalpha = alpha; 3180 PetscBLASInt one = 1, tnz; 3181 3182 PetscFunctionBegin; 3183 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3184 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3185 PetscCall(PetscLogFlops(totalnz)); 3186 PetscFunctionReturn(PETSC_SUCCESS); 3187 } 3188 3189 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3190 { 3191 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3192 MatScalar *v = a->a; 3193 PetscReal sum = 0.0; 3194 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3195 3196 PetscFunctionBegin; 3197 if (type == NORM_FROBENIUS) { 3198 #if defined(PETSC_USE_REAL___FP16) 3199 PetscBLASInt one = 1, cnt = bs2 * nz; 3200 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3201 #else 3202 for (i = 0; i < bs2 * nz; i++) { 3203 sum += PetscRealPart(PetscConj(*v) * (*v)); 3204 v++; 3205 } 3206 #endif 3207 *norm = PetscSqrtReal(sum); 3208 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3209 } else if (type == NORM_1) { /* maximum column sum */ 3210 PetscReal *tmp; 3211 PetscInt *bcol = a->j; 3212 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3213 for (i = 0; i < nz; i++) { 3214 for (j = 0; j < bs; j++) { 3215 k1 = bs * (*bcol) + j; /* column index */ 3216 for (k = 0; k < bs; k++) { 3217 tmp[k1] += PetscAbsScalar(*v); 3218 v++; 3219 } 3220 } 3221 bcol++; 3222 } 3223 *norm = 0.0; 3224 for (j = 0; j < A->cmap->n; j++) { 3225 if (tmp[j] > *norm) *norm = tmp[j]; 3226 } 3227 PetscCall(PetscFree(tmp)); 3228 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3229 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3230 *norm = 0.0; 3231 for (k = 0; k < bs; k++) { 3232 for (j = 0; j < a->mbs; j++) { 3233 v = a->a + bs2 * a->i[j] + k; 3234 sum = 0.0; 3235 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3236 for (k1 = 0; k1 < bs; k1++) { 3237 sum += PetscAbsScalar(*v); 3238 v += bs; 3239 } 3240 } 3241 if (sum > *norm) *norm = sum; 3242 } 3243 } 3244 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3245 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3246 PetscFunctionReturn(PETSC_SUCCESS); 3247 } 3248 3249 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3250 { 3251 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3252 3253 PetscFunctionBegin; 3254 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3255 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3256 *flg = PETSC_FALSE; 3257 PetscFunctionReturn(PETSC_SUCCESS); 3258 } 3259 3260 /* if the a->i are the same */ 3261 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3262 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3263 3264 /* if a->j are the same */ 3265 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3266 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3267 3268 /* if a->a are the same */ 3269 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3270 PetscFunctionReturn(PETSC_SUCCESS); 3271 } 3272 3273 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3274 { 3275 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3276 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3277 PetscScalar *x, zero = 0.0; 3278 MatScalar *aa, *aa_j; 3279 3280 PetscFunctionBegin; 3281 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3282 bs = A->rmap->bs; 3283 aa = a->a; 3284 ai = a->i; 3285 aj = a->j; 3286 ambs = a->mbs; 3287 bs2 = a->bs2; 3288 3289 PetscCall(VecSet(v, zero)); 3290 PetscCall(VecGetArray(v, &x)); 3291 PetscCall(VecGetLocalSize(v, &n)); 3292 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3293 for (i = 0; i < ambs; i++) { 3294 for (j = ai[i]; j < ai[i + 1]; j++) { 3295 if (aj[j] == i) { 3296 row = i * bs; 3297 aa_j = aa + j * bs2; 3298 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3299 break; 3300 } 3301 } 3302 } 3303 PetscCall(VecRestoreArray(v, &x)); 3304 PetscFunctionReturn(PETSC_SUCCESS); 3305 } 3306 3307 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3308 { 3309 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3310 const PetscScalar *l, *r, *li, *ri; 3311 PetscScalar x; 3312 MatScalar *aa, *v; 3313 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3314 const PetscInt *ai, *aj; 3315 3316 PetscFunctionBegin; 3317 ai = a->i; 3318 aj = a->j; 3319 aa = a->a; 3320 m = A->rmap->n; 3321 n = A->cmap->n; 3322 bs = A->rmap->bs; 3323 mbs = a->mbs; 3324 bs2 = a->bs2; 3325 if (ll) { 3326 PetscCall(VecGetArrayRead(ll, &l)); 3327 PetscCall(VecGetLocalSize(ll, &lm)); 3328 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3329 for (i = 0; i < mbs; i++) { /* for each block row */ 3330 M = ai[i + 1] - ai[i]; 3331 li = l + i * bs; 3332 v = aa + bs2 * ai[i]; 3333 for (j = 0; j < M; j++) { /* for each block */ 3334 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 3335 } 3336 } 3337 PetscCall(VecRestoreArrayRead(ll, &l)); 3338 PetscCall(PetscLogFlops(a->nz)); 3339 } 3340 3341 if (rr) { 3342 PetscCall(VecGetArrayRead(rr, &r)); 3343 PetscCall(VecGetLocalSize(rr, &rn)); 3344 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3345 for (i = 0; i < mbs; i++) { /* for each block row */ 3346 iai = ai[i]; 3347 M = ai[i + 1] - iai; 3348 v = aa + bs2 * iai; 3349 for (j = 0; j < M; j++) { /* for each block */ 3350 ri = r + bs * aj[iai + j]; 3351 for (k = 0; k < bs; k++) { 3352 x = ri[k]; 3353 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3354 v += bs; 3355 } 3356 } 3357 } 3358 PetscCall(VecRestoreArrayRead(rr, &r)); 3359 PetscCall(PetscLogFlops(a->nz)); 3360 } 3361 PetscFunctionReturn(PETSC_SUCCESS); 3362 } 3363 3364 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3365 { 3366 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3367 3368 PetscFunctionBegin; 3369 info->block_size = a->bs2; 3370 info->nz_allocated = a->bs2 * a->maxnz; 3371 info->nz_used = a->bs2 * a->nz; 3372 info->nz_unneeded = info->nz_allocated - info->nz_used; 3373 info->assemblies = A->num_ass; 3374 info->mallocs = A->info.mallocs; 3375 info->memory = 0; /* REVIEW ME */ 3376 if (A->factortype) { 3377 info->fill_ratio_given = A->info.fill_ratio_given; 3378 info->fill_ratio_needed = A->info.fill_ratio_needed; 3379 info->factor_mallocs = A->info.factor_mallocs; 3380 } else { 3381 info->fill_ratio_given = 0; 3382 info->fill_ratio_needed = 0; 3383 info->factor_mallocs = 0; 3384 } 3385 PetscFunctionReturn(PETSC_SUCCESS); 3386 } 3387 3388 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3389 { 3390 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3391 3392 PetscFunctionBegin; 3393 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3394 PetscFunctionReturn(PETSC_SUCCESS); 3395 } 3396 3397 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3398 { 3399 PetscFunctionBegin; 3400 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3401 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3402 PetscFunctionReturn(PETSC_SUCCESS); 3403 } 3404 3405 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3406 { 3407 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3408 PetscScalar *z = NULL, sum1; 3409 const PetscScalar *xb; 3410 PetscScalar x1; 3411 const MatScalar *v, *vv; 3412 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3413 PetscBool usecprow = a->compressedrow.use; 3414 3415 PetscFunctionBegin; 3416 idx = a->j; 3417 v = a->a; 3418 if (usecprow) { 3419 mbs = a->compressedrow.nrows; 3420 ii = a->compressedrow.i; 3421 ridx = a->compressedrow.rindex; 3422 } else { 3423 mbs = a->mbs; 3424 ii = a->i; 3425 z = c; 3426 } 3427 3428 for (i = 0; i < mbs; i++) { 3429 n = ii[1] - ii[0]; 3430 ii++; 3431 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3432 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3433 if (usecprow) z = c + ridx[i]; 3434 jj = idx; 3435 vv = v; 3436 for (k = 0; k < cn; k++) { 3437 idx = jj; 3438 v = vv; 3439 sum1 = 0.0; 3440 for (j = 0; j < n; j++) { 3441 xb = b + (*idx++); 3442 x1 = xb[0 + k * bm]; 3443 sum1 += v[0] * x1; 3444 v += 1; 3445 } 3446 z[0 + k * cm] = sum1; 3447 } 3448 if (!usecprow) z += 1; 3449 } 3450 PetscFunctionReturn(PETSC_SUCCESS); 3451 } 3452 3453 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3454 { 3455 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3456 PetscScalar *z = NULL, sum1, sum2; 3457 const PetscScalar *xb; 3458 PetscScalar x1, x2; 3459 const MatScalar *v, *vv; 3460 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3461 PetscBool usecprow = a->compressedrow.use; 3462 3463 PetscFunctionBegin; 3464 idx = a->j; 3465 v = a->a; 3466 if (usecprow) { 3467 mbs = a->compressedrow.nrows; 3468 ii = a->compressedrow.i; 3469 ridx = a->compressedrow.rindex; 3470 } else { 3471 mbs = a->mbs; 3472 ii = a->i; 3473 z = c; 3474 } 3475 3476 for (i = 0; i < mbs; i++) { 3477 n = ii[1] - ii[0]; 3478 ii++; 3479 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3480 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3481 if (usecprow) z = c + 2 * ridx[i]; 3482 jj = idx; 3483 vv = v; 3484 for (k = 0; k < cn; k++) { 3485 idx = jj; 3486 v = vv; 3487 sum1 = 0.0; 3488 sum2 = 0.0; 3489 for (j = 0; j < n; j++) { 3490 xb = b + 2 * (*idx++); 3491 x1 = xb[0 + k * bm]; 3492 x2 = xb[1 + k * bm]; 3493 sum1 += v[0] * x1 + v[2] * x2; 3494 sum2 += v[1] * x1 + v[3] * x2; 3495 v += 4; 3496 } 3497 z[0 + k * cm] = sum1; 3498 z[1 + k * cm] = sum2; 3499 } 3500 if (!usecprow) z += 2; 3501 } 3502 PetscFunctionReturn(PETSC_SUCCESS); 3503 } 3504 3505 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3506 { 3507 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3508 PetscScalar *z = NULL, sum1, sum2, sum3; 3509 const PetscScalar *xb; 3510 PetscScalar x1, x2, x3; 3511 const MatScalar *v, *vv; 3512 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3513 PetscBool usecprow = a->compressedrow.use; 3514 3515 PetscFunctionBegin; 3516 idx = a->j; 3517 v = a->a; 3518 if (usecprow) { 3519 mbs = a->compressedrow.nrows; 3520 ii = a->compressedrow.i; 3521 ridx = a->compressedrow.rindex; 3522 } else { 3523 mbs = a->mbs; 3524 ii = a->i; 3525 z = c; 3526 } 3527 3528 for (i = 0; i < mbs; i++) { 3529 n = ii[1] - ii[0]; 3530 ii++; 3531 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3532 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3533 if (usecprow) z = c + 3 * ridx[i]; 3534 jj = idx; 3535 vv = v; 3536 for (k = 0; k < cn; k++) { 3537 idx = jj; 3538 v = vv; 3539 sum1 = 0.0; 3540 sum2 = 0.0; 3541 sum3 = 0.0; 3542 for (j = 0; j < n; j++) { 3543 xb = b + 3 * (*idx++); 3544 x1 = xb[0 + k * bm]; 3545 x2 = xb[1 + k * bm]; 3546 x3 = xb[2 + k * bm]; 3547 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3548 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3549 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3550 v += 9; 3551 } 3552 z[0 + k * cm] = sum1; 3553 z[1 + k * cm] = sum2; 3554 z[2 + k * cm] = sum3; 3555 } 3556 if (!usecprow) z += 3; 3557 } 3558 PetscFunctionReturn(PETSC_SUCCESS); 3559 } 3560 3561 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3562 { 3563 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3564 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3565 const PetscScalar *xb; 3566 PetscScalar x1, x2, x3, x4; 3567 const MatScalar *v, *vv; 3568 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3569 PetscBool usecprow = a->compressedrow.use; 3570 3571 PetscFunctionBegin; 3572 idx = a->j; 3573 v = a->a; 3574 if (usecprow) { 3575 mbs = a->compressedrow.nrows; 3576 ii = a->compressedrow.i; 3577 ridx = a->compressedrow.rindex; 3578 } else { 3579 mbs = a->mbs; 3580 ii = a->i; 3581 z = c; 3582 } 3583 3584 for (i = 0; i < mbs; i++) { 3585 n = ii[1] - ii[0]; 3586 ii++; 3587 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3588 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3589 if (usecprow) z = c + 4 * ridx[i]; 3590 jj = idx; 3591 vv = v; 3592 for (k = 0; k < cn; k++) { 3593 idx = jj; 3594 v = vv; 3595 sum1 = 0.0; 3596 sum2 = 0.0; 3597 sum3 = 0.0; 3598 sum4 = 0.0; 3599 for (j = 0; j < n; j++) { 3600 xb = b + 4 * (*idx++); 3601 x1 = xb[0 + k * bm]; 3602 x2 = xb[1 + k * bm]; 3603 x3 = xb[2 + k * bm]; 3604 x4 = xb[3 + k * bm]; 3605 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3606 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3607 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3608 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3609 v += 16; 3610 } 3611 z[0 + k * cm] = sum1; 3612 z[1 + k * cm] = sum2; 3613 z[2 + k * cm] = sum3; 3614 z[3 + k * cm] = sum4; 3615 } 3616 if (!usecprow) z += 4; 3617 } 3618 PetscFunctionReturn(PETSC_SUCCESS); 3619 } 3620 3621 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3622 { 3623 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3624 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3625 const PetscScalar *xb; 3626 PetscScalar x1, x2, x3, x4, x5; 3627 const MatScalar *v, *vv; 3628 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3629 PetscBool usecprow = a->compressedrow.use; 3630 3631 PetscFunctionBegin; 3632 idx = a->j; 3633 v = a->a; 3634 if (usecprow) { 3635 mbs = a->compressedrow.nrows; 3636 ii = a->compressedrow.i; 3637 ridx = a->compressedrow.rindex; 3638 } else { 3639 mbs = a->mbs; 3640 ii = a->i; 3641 z = c; 3642 } 3643 3644 for (i = 0; i < mbs; i++) { 3645 n = ii[1] - ii[0]; 3646 ii++; 3647 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3648 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3649 if (usecprow) z = c + 5 * ridx[i]; 3650 jj = idx; 3651 vv = v; 3652 for (k = 0; k < cn; k++) { 3653 idx = jj; 3654 v = vv; 3655 sum1 = 0.0; 3656 sum2 = 0.0; 3657 sum3 = 0.0; 3658 sum4 = 0.0; 3659 sum5 = 0.0; 3660 for (j = 0; j < n; j++) { 3661 xb = b + 5 * (*idx++); 3662 x1 = xb[0 + k * bm]; 3663 x2 = xb[1 + k * bm]; 3664 x3 = xb[2 + k * bm]; 3665 x4 = xb[3 + k * bm]; 3666 x5 = xb[4 + k * bm]; 3667 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3668 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3669 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3670 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3671 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3672 v += 25; 3673 } 3674 z[0 + k * cm] = sum1; 3675 z[1 + k * cm] = sum2; 3676 z[2 + k * cm] = sum3; 3677 z[3 + k * cm] = sum4; 3678 z[4 + k * cm] = sum5; 3679 } 3680 if (!usecprow) z += 5; 3681 } 3682 PetscFunctionReturn(PETSC_SUCCESS); 3683 } 3684 3685 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3686 { 3687 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3688 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3689 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3690 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3691 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3692 PetscBLASInt bbs, bcn, bbm, bcm; 3693 PetscScalar *z = NULL; 3694 PetscScalar *c, *b; 3695 const MatScalar *v; 3696 const PetscInt *idx, *ii, *ridx = NULL; 3697 PetscScalar _DZero = 0.0, _DOne = 1.0; 3698 PetscBool usecprow = a->compressedrow.use; 3699 3700 PetscFunctionBegin; 3701 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 3702 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3703 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3704 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3705 b = bd->v; 3706 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 3707 PetscCall(MatDenseGetArray(C, &c)); 3708 switch (bs) { 3709 case 1: 3710 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3711 break; 3712 case 2: 3713 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3714 break; 3715 case 3: 3716 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3717 break; 3718 case 4: 3719 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3720 break; 3721 case 5: 3722 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3723 break; 3724 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3725 PetscCall(PetscBLASIntCast(bs, &bbs)); 3726 PetscCall(PetscBLASIntCast(cn, &bcn)); 3727 PetscCall(PetscBLASIntCast(bm, &bbm)); 3728 PetscCall(PetscBLASIntCast(cm, &bcm)); 3729 idx = a->j; 3730 v = a->a; 3731 if (usecprow) { 3732 mbs = a->compressedrow.nrows; 3733 ii = a->compressedrow.i; 3734 ridx = a->compressedrow.rindex; 3735 } else { 3736 mbs = a->mbs; 3737 ii = a->i; 3738 z = c; 3739 } 3740 for (i = 0; i < mbs; i++) { 3741 n = ii[1] - ii[0]; 3742 ii++; 3743 if (usecprow) z = c + bs * ridx[i]; 3744 if (n) { 3745 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3746 v += bs2; 3747 } 3748 for (j = 1; j < n; j++) { 3749 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3750 v += bs2; 3751 } 3752 if (!usecprow) z += bs; 3753 } 3754 } 3755 PetscCall(MatDenseRestoreArray(C, &c)); 3756 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3757 PetscFunctionReturn(PETSC_SUCCESS); 3758 } 3759