1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 15 const PetscInt *idx; 16 PetscInt start, end, *ai, *aj, bs; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m, &table)); 28 PetscCall(PetscMalloc1(m + 1, &nidx)); 29 30 for (i = 0; i < is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 PetscCall(PetscBTMemzero(m, table)); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 PetscCall(ISGetIndices(is[i], &idx)); 37 PetscCall(ISGetLocalSize(is[i], &n)); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j = 0; j < n; ++j) { 41 ival = idx[j] / bs; /* convert the indices into block indices */ 42 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 43 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44 } 45 PetscCall(ISRestoreIndices(is[i], &idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j = 0; j < ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row + 1]; 55 for (l = start; l < end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58 } 59 } 60 } 61 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 62 } 63 PetscCall(PetscBTDestroy(&table)); 64 PetscCall(PetscFree(nidx)); 65 PetscFunctionReturn(PETSC_SUCCESS); 66 } 67 68 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 69 { 70 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 71 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 72 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 73 const PetscInt *irow, *icol; 74 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 75 PetscInt *aj = a->j, *ai = a->i; 76 MatScalar *mat_a; 77 Mat C; 78 PetscBool flag; 79 80 PetscFunctionBegin; 81 PetscCall(ISGetIndices(isrow, &irow)); 82 PetscCall(ISGetIndices(iscol, &icol)); 83 PetscCall(ISGetLocalSize(isrow, &nrows)); 84 PetscCall(ISGetLocalSize(iscol, &ncols)); 85 86 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 87 ssmap = smap; 88 PetscCall(PetscMalloc1(1 + nrows, &lens)); 89 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 90 /* determine lens of each row */ 91 for (i = 0; i < nrows; i++) { 92 kstart = ai[irow[i]]; 93 kend = kstart + a->ilen[irow[i]]; 94 lens[i] = 0; 95 for (k = kstart; k < kend; k++) { 96 if (ssmap[aj[k]]) lens[i]++; 97 } 98 } 99 /* Create and fill new matrix */ 100 if (scall == MAT_REUSE_MATRIX) { 101 c = (Mat_SeqBAIJ *)((*B)->data); 102 103 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 104 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 105 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 106 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 107 C = *B; 108 } else { 109 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 110 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 111 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 112 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 113 } 114 c = (Mat_SeqBAIJ *)(C->data); 115 for (i = 0; i < nrows; i++) { 116 row = irow[i]; 117 kstart = ai[row]; 118 kend = kstart + a->ilen[row]; 119 mat_i = c->i[i]; 120 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 121 mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 122 mat_ilen = c->ilen + i; 123 for (k = kstart; k < kend; k++) { 124 if ((tcol = ssmap[a->j[k]])) { 125 *mat_j++ = tcol - 1; 126 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 127 mat_a += bs2; 128 (*mat_ilen)++; 129 } 130 } 131 } 132 /* sort */ 133 if (c->j && c->a) { 134 MatScalar *work; 135 PetscCall(PetscMalloc1(bs2, &work)); 136 for (i = 0; i < nrows; i++) { 137 PetscInt ilen; 138 mat_i = c->i[i]; 139 mat_j = c->j + mat_i; 140 mat_a = c->a + mat_i * bs2; 141 ilen = c->ilen[i]; 142 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 143 } 144 PetscCall(PetscFree(work)); 145 } 146 147 /* Free work space */ 148 PetscCall(ISRestoreIndices(iscol, &icol)); 149 PetscCall(PetscFree(smap)); 150 PetscCall(PetscFree(lens)); 151 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 152 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 153 154 PetscCall(ISRestoreIndices(isrow, &irow)); 155 *B = C; 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 160 { 161 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 162 IS is1, is2; 163 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 164 const PetscInt *irow, *icol; 165 166 PetscFunctionBegin; 167 PetscCall(ISGetIndices(isrow, &irow)); 168 PetscCall(ISGetIndices(iscol, &icol)); 169 PetscCall(ISGetLocalSize(isrow, &nrows)); 170 PetscCall(ISGetLocalSize(iscol, &ncols)); 171 172 /* Verify if the indices correspond to each element in a block 173 and form the IS with compressed IS */ 174 maxmnbs = PetscMax(a->mbs, a->nbs); 175 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 176 PetscCall(PetscArrayzero(vary, a->mbs)); 177 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 178 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 179 count = 0; 180 for (i = 0; i < nrows; i++) { 181 j = irow[i] / bs; 182 if ((vary[j]--) == bs) iary[count++] = j; 183 } 184 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 185 186 PetscCall(PetscArrayzero(vary, a->nbs)); 187 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 188 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 189 count = 0; 190 for (i = 0; i < ncols; i++) { 191 j = icol[i] / bs; 192 if ((vary[j]--) == bs) iary[count++] = j; 193 } 194 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 195 PetscCall(ISRestoreIndices(isrow, &irow)); 196 PetscCall(ISRestoreIndices(iscol, &icol)); 197 PetscCall(PetscFree2(vary, iary)); 198 199 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 200 PetscCall(ISDestroy(&is1)); 201 PetscCall(ISDestroy(&is2)); 202 PetscFunctionReturn(PETSC_SUCCESS); 203 } 204 205 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 206 { 207 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 208 Mat_SubSppt *submatj = c->submatis1; 209 210 PetscFunctionBegin; 211 PetscCall((*submatj->destroy)(C)); 212 PetscCall(MatDestroySubMatrix_Private(submatj)); 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 217 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 218 { 219 PetscInt i; 220 Mat C; 221 Mat_SeqBAIJ *c; 222 Mat_SubSppt *submatj; 223 224 PetscFunctionBegin; 225 for (i = 0; i < n; i++) { 226 C = (*mat)[i]; 227 c = (Mat_SeqBAIJ *)C->data; 228 submatj = c->submatis1; 229 if (submatj) { 230 if (--((PetscObject)C)->refct <= 0) { 231 PetscCall(PetscFree(C->factorprefix)); 232 PetscCall((*submatj->destroy)(C)); 233 PetscCall(MatDestroySubMatrix_Private(submatj)); 234 PetscCall(PetscFree(C->defaultvectype)); 235 PetscCall(PetscFree(C->defaultrandtype)); 236 PetscCall(PetscLayoutDestroy(&C->rmap)); 237 PetscCall(PetscLayoutDestroy(&C->cmap)); 238 PetscCall(PetscHeaderDestroy(&C)); 239 } 240 } else { 241 PetscCall(MatDestroy(&C)); 242 } 243 } 244 245 /* Destroy Dummy submatrices created for reuse */ 246 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 247 248 PetscCall(PetscFree(*mat)); 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 253 { 254 PetscInt i; 255 256 PetscFunctionBegin; 257 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 258 259 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 260 PetscFunctionReturn(PETSC_SUCCESS); 261 } 262 263 /* Should check that shapes of vectors and matrices match */ 264 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 265 { 266 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 267 PetscScalar *z, sum; 268 const PetscScalar *x; 269 const MatScalar *v; 270 PetscInt mbs, i, n; 271 const PetscInt *idx, *ii, *ridx = NULL; 272 PetscBool usecprow = a->compressedrow.use; 273 274 PetscFunctionBegin; 275 PetscCall(VecGetArrayRead(xx, &x)); 276 PetscCall(VecGetArrayWrite(zz, &z)); 277 278 if (usecprow) { 279 mbs = a->compressedrow.nrows; 280 ii = a->compressedrow.i; 281 ridx = a->compressedrow.rindex; 282 PetscCall(PetscArrayzero(z, a->mbs)); 283 } else { 284 mbs = a->mbs; 285 ii = a->i; 286 } 287 288 for (i = 0; i < mbs; i++) { 289 n = ii[1] - ii[0]; 290 v = a->a + ii[0]; 291 idx = a->j + ii[0]; 292 ii++; 293 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 294 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 295 sum = 0.0; 296 PetscSparseDensePlusDot(sum, x, v, idx, n); 297 if (usecprow) { 298 z[ridx[i]] = sum; 299 } else { 300 z[i] = sum; 301 } 302 } 303 PetscCall(VecRestoreArrayRead(xx, &x)); 304 PetscCall(VecRestoreArrayWrite(zz, &z)); 305 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 306 PetscFunctionReturn(PETSC_SUCCESS); 307 } 308 309 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 310 { 311 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 312 PetscScalar *z = NULL, sum1, sum2, *zarray; 313 const PetscScalar *x, *xb; 314 PetscScalar x1, x2; 315 const MatScalar *v; 316 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 317 PetscBool usecprow = a->compressedrow.use; 318 319 PetscFunctionBegin; 320 PetscCall(VecGetArrayRead(xx, &x)); 321 PetscCall(VecGetArrayWrite(zz, &zarray)); 322 323 idx = a->j; 324 v = a->a; 325 if (usecprow) { 326 mbs = a->compressedrow.nrows; 327 ii = a->compressedrow.i; 328 ridx = a->compressedrow.rindex; 329 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 330 } else { 331 mbs = a->mbs; 332 ii = a->i; 333 z = zarray; 334 } 335 336 for (i = 0; i < mbs; i++) { 337 n = ii[1] - ii[0]; 338 ii++; 339 sum1 = 0.0; 340 sum2 = 0.0; 341 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 342 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 343 for (j = 0; j < n; j++) { 344 xb = x + 2 * (*idx++); 345 x1 = xb[0]; 346 x2 = xb[1]; 347 sum1 += v[0] * x1 + v[2] * x2; 348 sum2 += v[1] * x1 + v[3] * x2; 349 v += 4; 350 } 351 if (usecprow) z = zarray + 2 * ridx[i]; 352 z[0] = sum1; 353 z[1] = sum2; 354 if (!usecprow) z += 2; 355 } 356 PetscCall(VecRestoreArrayRead(xx, &x)); 357 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 358 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 359 PetscFunctionReturn(PETSC_SUCCESS); 360 } 361 362 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 363 { 364 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 365 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 366 const PetscScalar *x, *xb; 367 const MatScalar *v; 368 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 369 PetscBool usecprow = a->compressedrow.use; 370 371 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 372 #pragma disjoint(*v, *z, *xb) 373 #endif 374 375 PetscFunctionBegin; 376 PetscCall(VecGetArrayRead(xx, &x)); 377 PetscCall(VecGetArrayWrite(zz, &zarray)); 378 379 idx = a->j; 380 v = a->a; 381 if (usecprow) { 382 mbs = a->compressedrow.nrows; 383 ii = a->compressedrow.i; 384 ridx = a->compressedrow.rindex; 385 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 386 } else { 387 mbs = a->mbs; 388 ii = a->i; 389 z = zarray; 390 } 391 392 for (i = 0; i < mbs; i++) { 393 n = ii[1] - ii[0]; 394 ii++; 395 sum1 = 0.0; 396 sum2 = 0.0; 397 sum3 = 0.0; 398 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 399 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 400 for (j = 0; j < n; j++) { 401 xb = x + 3 * (*idx++); 402 x1 = xb[0]; 403 x2 = xb[1]; 404 x3 = xb[2]; 405 406 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 407 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 408 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 409 v += 9; 410 } 411 if (usecprow) z = zarray + 3 * ridx[i]; 412 z[0] = sum1; 413 z[1] = sum2; 414 z[2] = sum3; 415 if (!usecprow) z += 3; 416 } 417 PetscCall(VecRestoreArrayRead(xx, &x)); 418 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 419 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 420 PetscFunctionReturn(PETSC_SUCCESS); 421 } 422 423 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 424 { 425 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 426 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 427 const PetscScalar *x, *xb; 428 const MatScalar *v; 429 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 430 PetscBool usecprow = a->compressedrow.use; 431 432 PetscFunctionBegin; 433 PetscCall(VecGetArrayRead(xx, &x)); 434 PetscCall(VecGetArrayWrite(zz, &zarray)); 435 436 idx = a->j; 437 v = a->a; 438 if (usecprow) { 439 mbs = a->compressedrow.nrows; 440 ii = a->compressedrow.i; 441 ridx = a->compressedrow.rindex; 442 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 443 } else { 444 mbs = a->mbs; 445 ii = a->i; 446 z = zarray; 447 } 448 449 for (i = 0; i < mbs; i++) { 450 n = ii[1] - ii[0]; 451 ii++; 452 sum1 = 0.0; 453 sum2 = 0.0; 454 sum3 = 0.0; 455 sum4 = 0.0; 456 457 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 458 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 459 for (j = 0; j < n; j++) { 460 xb = x + 4 * (*idx++); 461 x1 = xb[0]; 462 x2 = xb[1]; 463 x3 = xb[2]; 464 x4 = xb[3]; 465 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 466 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 467 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 468 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 469 v += 16; 470 } 471 if (usecprow) z = zarray + 4 * ridx[i]; 472 z[0] = sum1; 473 z[1] = sum2; 474 z[2] = sum3; 475 z[3] = sum4; 476 if (!usecprow) z += 4; 477 } 478 PetscCall(VecRestoreArrayRead(xx, &x)); 479 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 480 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 481 PetscFunctionReturn(PETSC_SUCCESS); 482 } 483 484 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 485 { 486 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 487 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 488 const PetscScalar *xb, *x; 489 const MatScalar *v; 490 const PetscInt *idx, *ii, *ridx = NULL; 491 PetscInt mbs, i, j, n; 492 PetscBool usecprow = a->compressedrow.use; 493 494 PetscFunctionBegin; 495 PetscCall(VecGetArrayRead(xx, &x)); 496 PetscCall(VecGetArrayWrite(zz, &zarray)); 497 498 idx = a->j; 499 v = a->a; 500 if (usecprow) { 501 mbs = a->compressedrow.nrows; 502 ii = a->compressedrow.i; 503 ridx = a->compressedrow.rindex; 504 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 505 } else { 506 mbs = a->mbs; 507 ii = a->i; 508 z = zarray; 509 } 510 511 for (i = 0; i < mbs; i++) { 512 n = ii[1] - ii[0]; 513 ii++; 514 sum1 = 0.0; 515 sum2 = 0.0; 516 sum3 = 0.0; 517 sum4 = 0.0; 518 sum5 = 0.0; 519 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 520 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 521 for (j = 0; j < n; j++) { 522 xb = x + 5 * (*idx++); 523 x1 = xb[0]; 524 x2 = xb[1]; 525 x3 = xb[2]; 526 x4 = xb[3]; 527 x5 = xb[4]; 528 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 529 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 530 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 531 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 532 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 533 v += 25; 534 } 535 if (usecprow) z = zarray + 5 * ridx[i]; 536 z[0] = sum1; 537 z[1] = sum2; 538 z[2] = sum3; 539 z[3] = sum4; 540 z[4] = sum5; 541 if (!usecprow) z += 5; 542 } 543 PetscCall(VecRestoreArrayRead(xx, &x)); 544 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 545 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 546 PetscFunctionReturn(PETSC_SUCCESS); 547 } 548 549 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 550 { 551 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 552 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 553 const PetscScalar *x, *xb; 554 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 555 const MatScalar *v; 556 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 557 PetscBool usecprow = a->compressedrow.use; 558 559 PetscFunctionBegin; 560 PetscCall(VecGetArrayRead(xx, &x)); 561 PetscCall(VecGetArrayWrite(zz, &zarray)); 562 563 idx = a->j; 564 v = a->a; 565 if (usecprow) { 566 mbs = a->compressedrow.nrows; 567 ii = a->compressedrow.i; 568 ridx = a->compressedrow.rindex; 569 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 570 } else { 571 mbs = a->mbs; 572 ii = a->i; 573 z = zarray; 574 } 575 576 for (i = 0; i < mbs; i++) { 577 n = ii[1] - ii[0]; 578 ii++; 579 sum1 = 0.0; 580 sum2 = 0.0; 581 sum3 = 0.0; 582 sum4 = 0.0; 583 sum5 = 0.0; 584 sum6 = 0.0; 585 586 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 587 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 588 for (j = 0; j < n; j++) { 589 xb = x + 6 * (*idx++); 590 x1 = xb[0]; 591 x2 = xb[1]; 592 x3 = xb[2]; 593 x4 = xb[3]; 594 x5 = xb[4]; 595 x6 = xb[5]; 596 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 597 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 598 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 599 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 600 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 601 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 602 v += 36; 603 } 604 if (usecprow) z = zarray + 6 * ridx[i]; 605 z[0] = sum1; 606 z[1] = sum2; 607 z[2] = sum3; 608 z[3] = sum4; 609 z[4] = sum5; 610 z[5] = sum6; 611 if (!usecprow) z += 6; 612 } 613 614 PetscCall(VecRestoreArrayRead(xx, &x)); 615 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 616 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 621 { 622 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 623 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 624 const PetscScalar *x, *xb; 625 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 626 const MatScalar *v; 627 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 628 PetscBool usecprow = a->compressedrow.use; 629 630 PetscFunctionBegin; 631 PetscCall(VecGetArrayRead(xx, &x)); 632 PetscCall(VecGetArrayWrite(zz, &zarray)); 633 634 idx = a->j; 635 v = a->a; 636 if (usecprow) { 637 mbs = a->compressedrow.nrows; 638 ii = a->compressedrow.i; 639 ridx = a->compressedrow.rindex; 640 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 641 } else { 642 mbs = a->mbs; 643 ii = a->i; 644 z = zarray; 645 } 646 647 for (i = 0; i < mbs; i++) { 648 n = ii[1] - ii[0]; 649 ii++; 650 sum1 = 0.0; 651 sum2 = 0.0; 652 sum3 = 0.0; 653 sum4 = 0.0; 654 sum5 = 0.0; 655 sum6 = 0.0; 656 sum7 = 0.0; 657 658 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 659 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 660 for (j = 0; j < n; j++) { 661 xb = x + 7 * (*idx++); 662 x1 = xb[0]; 663 x2 = xb[1]; 664 x3 = xb[2]; 665 x4 = xb[3]; 666 x5 = xb[4]; 667 x6 = xb[5]; 668 x7 = xb[6]; 669 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 670 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 671 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 672 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 673 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 674 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 675 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 676 v += 49; 677 } 678 if (usecprow) z = zarray + 7 * ridx[i]; 679 z[0] = sum1; 680 z[1] = sum2; 681 z[2] = sum3; 682 z[3] = sum4; 683 z[4] = sum5; 684 z[5] = sum6; 685 z[6] = sum7; 686 if (!usecprow) z += 7; 687 } 688 689 PetscCall(VecRestoreArrayRead(xx, &x)); 690 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 691 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 692 PetscFunctionReturn(PETSC_SUCCESS); 693 } 694 695 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 696 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 697 { 698 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 699 PetscScalar *z = NULL, *work, *workt, *zarray; 700 const PetscScalar *x, *xb; 701 const MatScalar *v; 702 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 703 const PetscInt *idx, *ii, *ridx = NULL; 704 PetscInt k; 705 PetscBool usecprow = a->compressedrow.use; 706 707 __m256d a0, a1, a2, a3, a4, a5; 708 __m256d w0, w1, w2, w3; 709 __m256d z0, z1, z2; 710 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 711 712 PetscFunctionBegin; 713 PetscCall(VecGetArrayRead(xx, &x)); 714 PetscCall(VecGetArrayWrite(zz, &zarray)); 715 716 idx = a->j; 717 v = a->a; 718 if (usecprow) { 719 mbs = a->compressedrow.nrows; 720 ii = a->compressedrow.i; 721 ridx = a->compressedrow.rindex; 722 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 723 } else { 724 mbs = a->mbs; 725 ii = a->i; 726 z = zarray; 727 } 728 729 if (!a->mult_work) { 730 k = PetscMax(A->rmap->n, A->cmap->n); 731 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 732 } 733 734 work = a->mult_work; 735 for (i = 0; i < mbs; i++) { 736 n = ii[1] - ii[0]; 737 ii++; 738 workt = work; 739 for (j = 0; j < n; j++) { 740 xb = x + bs * (*idx++); 741 for (k = 0; k < bs; k++) workt[k] = xb[k]; 742 workt += bs; 743 } 744 if (usecprow) z = zarray + bs * ridx[i]; 745 746 z0 = _mm256_setzero_pd(); 747 z1 = _mm256_setzero_pd(); 748 z2 = _mm256_setzero_pd(); 749 750 for (j = 0; j < n; j++) { 751 /* first column of a */ 752 w0 = _mm256_set1_pd(work[j * 9]); 753 a0 = _mm256_loadu_pd(&v[j * 81]); 754 z0 = _mm256_fmadd_pd(a0, w0, z0); 755 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 756 z1 = _mm256_fmadd_pd(a1, w0, z1); 757 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 758 z2 = _mm256_fmadd_pd(a2, w0, z2); 759 760 /* second column of a */ 761 w1 = _mm256_set1_pd(work[j * 9 + 1]); 762 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 763 z0 = _mm256_fmadd_pd(a0, w1, z0); 764 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 765 z1 = _mm256_fmadd_pd(a1, w1, z1); 766 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 767 z2 = _mm256_fmadd_pd(a2, w1, z2); 768 769 /* third column of a */ 770 w2 = _mm256_set1_pd(work[j * 9 + 2]); 771 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 772 z0 = _mm256_fmadd_pd(a3, w2, z0); 773 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 774 z1 = _mm256_fmadd_pd(a4, w2, z1); 775 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 776 z2 = _mm256_fmadd_pd(a5, w2, z2); 777 778 /* fourth column of a */ 779 w3 = _mm256_set1_pd(work[j * 9 + 3]); 780 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 781 z0 = _mm256_fmadd_pd(a0, w3, z0); 782 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 783 z1 = _mm256_fmadd_pd(a1, w3, z1); 784 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 785 z2 = _mm256_fmadd_pd(a2, w3, z2); 786 787 /* fifth column of a */ 788 w0 = _mm256_set1_pd(work[j * 9 + 4]); 789 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 790 z0 = _mm256_fmadd_pd(a3, w0, z0); 791 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 792 z1 = _mm256_fmadd_pd(a4, w0, z1); 793 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 794 z2 = _mm256_fmadd_pd(a5, w0, z2); 795 796 /* sixth column of a */ 797 w1 = _mm256_set1_pd(work[j * 9 + 5]); 798 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 799 z0 = _mm256_fmadd_pd(a0, w1, z0); 800 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 801 z1 = _mm256_fmadd_pd(a1, w1, z1); 802 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 803 z2 = _mm256_fmadd_pd(a2, w1, z2); 804 805 /* seventh column of a */ 806 w2 = _mm256_set1_pd(work[j * 9 + 6]); 807 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 808 z0 = _mm256_fmadd_pd(a0, w2, z0); 809 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 810 z1 = _mm256_fmadd_pd(a1, w2, z1); 811 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 812 z2 = _mm256_fmadd_pd(a2, w2, z2); 813 814 /* eighth column of a */ 815 w3 = _mm256_set1_pd(work[j * 9 + 7]); 816 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 817 z0 = _mm256_fmadd_pd(a3, w3, z0); 818 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 819 z1 = _mm256_fmadd_pd(a4, w3, z1); 820 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 821 z2 = _mm256_fmadd_pd(a5, w3, z2); 822 823 /* ninth column of a */ 824 w0 = _mm256_set1_pd(work[j * 9 + 8]); 825 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 826 z0 = _mm256_fmadd_pd(a0, w0, z0); 827 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 828 z1 = _mm256_fmadd_pd(a1, w0, z1); 829 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 830 z2 = _mm256_fmadd_pd(a2, w0, z2); 831 } 832 833 _mm256_storeu_pd(&z[0], z0); 834 _mm256_storeu_pd(&z[4], z1); 835 _mm256_maskstore_pd(&z[8], mask1, z2); 836 837 v += n * bs2; 838 if (!usecprow) z += bs; 839 } 840 PetscCall(VecRestoreArrayRead(xx, &x)); 841 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 842 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 843 PetscFunctionReturn(PETSC_SUCCESS); 844 } 845 #endif 846 847 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 848 { 849 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 850 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 851 const PetscScalar *x, *xb; 852 PetscScalar *zarray, xv; 853 const MatScalar *v; 854 const PetscInt *ii, *ij = a->j, *idx; 855 PetscInt mbs, i, j, k, n, *ridx = NULL; 856 PetscBool usecprow = a->compressedrow.use; 857 858 PetscFunctionBegin; 859 PetscCall(VecGetArrayRead(xx, &x)); 860 PetscCall(VecGetArrayWrite(zz, &zarray)); 861 862 v = a->a; 863 if (usecprow) { 864 mbs = a->compressedrow.nrows; 865 ii = a->compressedrow.i; 866 ridx = a->compressedrow.rindex; 867 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 868 } else { 869 mbs = a->mbs; 870 ii = a->i; 871 z = zarray; 872 } 873 874 for (i = 0; i < mbs; i++) { 875 n = ii[i + 1] - ii[i]; 876 idx = ij + ii[i]; 877 sum1 = 0.0; 878 sum2 = 0.0; 879 sum3 = 0.0; 880 sum4 = 0.0; 881 sum5 = 0.0; 882 sum6 = 0.0; 883 sum7 = 0.0; 884 sum8 = 0.0; 885 sum9 = 0.0; 886 sum10 = 0.0; 887 sum11 = 0.0; 888 889 for (j = 0; j < n; j++) { 890 xb = x + 11 * (idx[j]); 891 892 for (k = 0; k < 11; k++) { 893 xv = xb[k]; 894 sum1 += v[0] * xv; 895 sum2 += v[1] * xv; 896 sum3 += v[2] * xv; 897 sum4 += v[3] * xv; 898 sum5 += v[4] * xv; 899 sum6 += v[5] * xv; 900 sum7 += v[6] * xv; 901 sum8 += v[7] * xv; 902 sum9 += v[8] * xv; 903 sum10 += v[9] * xv; 904 sum11 += v[10] * xv; 905 v += 11; 906 } 907 } 908 if (usecprow) z = zarray + 11 * ridx[i]; 909 z[0] = sum1; 910 z[1] = sum2; 911 z[2] = sum3; 912 z[3] = sum4; 913 z[4] = sum5; 914 z[5] = sum6; 915 z[6] = sum7; 916 z[7] = sum8; 917 z[8] = sum9; 918 z[9] = sum10; 919 z[10] = sum11; 920 921 if (!usecprow) z += 11; 922 } 923 924 PetscCall(VecRestoreArrayRead(xx, &x)); 925 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 926 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 927 PetscFunctionReturn(PETSC_SUCCESS); 928 } 929 930 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 931 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 932 { 933 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 934 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 935 const PetscScalar *x, *xb; 936 PetscScalar *zarray, xv; 937 const MatScalar *v; 938 const PetscInt *ii, *ij = a->j, *idx; 939 PetscInt mbs, i, j, k, n, *ridx = NULL; 940 PetscBool usecprow = a->compressedrow.use; 941 942 PetscFunctionBegin; 943 PetscCall(VecGetArrayRead(xx, &x)); 944 PetscCall(VecGetArrayWrite(zz, &zarray)); 945 946 v = a->a; 947 if (usecprow) { 948 mbs = a->compressedrow.nrows; 949 ii = a->compressedrow.i; 950 ridx = a->compressedrow.rindex; 951 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 952 } else { 953 mbs = a->mbs; 954 ii = a->i; 955 z = zarray; 956 } 957 958 for (i = 0; i < mbs; i++) { 959 n = ii[i + 1] - ii[i]; 960 idx = ij + ii[i]; 961 sum1 = 0.0; 962 sum2 = 0.0; 963 sum3 = 0.0; 964 sum4 = 0.0; 965 sum5 = 0.0; 966 sum6 = 0.0; 967 sum7 = 0.0; 968 sum8 = 0.0; 969 sum9 = 0.0; 970 sum10 = 0.0; 971 sum11 = 0.0; 972 sum12 = 0.0; 973 974 for (j = 0; j < n; j++) { 975 xb = x + 12 * (idx[j]); 976 977 for (k = 0; k < 12; k++) { 978 xv = xb[k]; 979 sum1 += v[0] * xv; 980 sum2 += v[1] * xv; 981 sum3 += v[2] * xv; 982 sum4 += v[3] * xv; 983 sum5 += v[4] * xv; 984 sum6 += v[5] * xv; 985 sum7 += v[6] * xv; 986 sum8 += v[7] * xv; 987 sum9 += v[8] * xv; 988 sum10 += v[9] * xv; 989 sum11 += v[10] * xv; 990 sum12 += v[11] * xv; 991 v += 12; 992 } 993 } 994 if (usecprow) z = zarray + 12 * ridx[i]; 995 z[0] = sum1; 996 z[1] = sum2; 997 z[2] = sum3; 998 z[3] = sum4; 999 z[4] = sum5; 1000 z[5] = sum6; 1001 z[6] = sum7; 1002 z[7] = sum8; 1003 z[8] = sum9; 1004 z[9] = sum10; 1005 z[10] = sum11; 1006 z[11] = sum12; 1007 if (!usecprow) z += 12; 1008 } 1009 PetscCall(VecRestoreArrayRead(xx, &x)); 1010 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1011 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1012 PetscFunctionReturn(PETSC_SUCCESS); 1013 } 1014 1015 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 1016 { 1017 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1018 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1019 const PetscScalar *x, *xb; 1020 PetscScalar *zarray, *yarray, xv; 1021 const MatScalar *v; 1022 const PetscInt *ii, *ij = a->j, *idx; 1023 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 1024 PetscBool usecprow = a->compressedrow.use; 1025 1026 PetscFunctionBegin; 1027 PetscCall(VecGetArrayRead(xx, &x)); 1028 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1029 1030 v = a->a; 1031 if (usecprow) { 1032 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1033 mbs = a->compressedrow.nrows; 1034 ii = a->compressedrow.i; 1035 ridx = a->compressedrow.rindex; 1036 } else { 1037 ii = a->i; 1038 y = yarray; 1039 z = zarray; 1040 } 1041 1042 for (i = 0; i < mbs; i++) { 1043 n = ii[i + 1] - ii[i]; 1044 idx = ij + ii[i]; 1045 1046 if (usecprow) { 1047 y = yarray + 12 * ridx[i]; 1048 z = zarray + 12 * ridx[i]; 1049 } 1050 sum1 = y[0]; 1051 sum2 = y[1]; 1052 sum3 = y[2]; 1053 sum4 = y[3]; 1054 sum5 = y[4]; 1055 sum6 = y[5]; 1056 sum7 = y[6]; 1057 sum8 = y[7]; 1058 sum9 = y[8]; 1059 sum10 = y[9]; 1060 sum11 = y[10]; 1061 sum12 = y[11]; 1062 1063 for (j = 0; j < n; j++) { 1064 xb = x + 12 * (idx[j]); 1065 1066 for (k = 0; k < 12; k++) { 1067 xv = xb[k]; 1068 sum1 += v[0] * xv; 1069 sum2 += v[1] * xv; 1070 sum3 += v[2] * xv; 1071 sum4 += v[3] * xv; 1072 sum5 += v[4] * xv; 1073 sum6 += v[5] * xv; 1074 sum7 += v[6] * xv; 1075 sum8 += v[7] * xv; 1076 sum9 += v[8] * xv; 1077 sum10 += v[9] * xv; 1078 sum11 += v[10] * xv; 1079 sum12 += v[11] * xv; 1080 v += 12; 1081 } 1082 } 1083 1084 z[0] = sum1; 1085 z[1] = sum2; 1086 z[2] = sum3; 1087 z[3] = sum4; 1088 z[4] = sum5; 1089 z[5] = sum6; 1090 z[6] = sum7; 1091 z[7] = sum8; 1092 z[8] = sum9; 1093 z[9] = sum10; 1094 z[10] = sum11; 1095 z[11] = sum12; 1096 if (!usecprow) { 1097 y += 12; 1098 z += 12; 1099 } 1100 } 1101 PetscCall(VecRestoreArrayRead(xx, &x)); 1102 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1103 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1104 PetscFunctionReturn(PETSC_SUCCESS); 1105 } 1106 1107 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1108 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1109 { 1110 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1111 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1112 const PetscScalar *x, *xb; 1113 PetscScalar x1, x2, x3, x4, *zarray; 1114 const MatScalar *v; 1115 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1116 PetscInt mbs, i, j, n; 1117 PetscBool usecprow = a->compressedrow.use; 1118 1119 PetscFunctionBegin; 1120 PetscCall(VecGetArrayRead(xx, &x)); 1121 PetscCall(VecGetArrayWrite(zz, &zarray)); 1122 1123 v = a->a; 1124 if (usecprow) { 1125 mbs = a->compressedrow.nrows; 1126 ii = a->compressedrow.i; 1127 ridx = a->compressedrow.rindex; 1128 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1129 } else { 1130 mbs = a->mbs; 1131 ii = a->i; 1132 z = zarray; 1133 } 1134 1135 for (i = 0; i < mbs; i++) { 1136 n = ii[i + 1] - ii[i]; 1137 idx = ij + ii[i]; 1138 1139 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1140 for (j = 0; j < n; j++) { 1141 xb = x + 12 * (idx[j]); 1142 x1 = xb[0]; 1143 x2 = xb[1]; 1144 x3 = xb[2]; 1145 x4 = xb[3]; 1146 1147 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1148 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1149 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1150 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1151 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1152 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1153 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1154 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1155 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1156 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1157 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1158 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1159 v += 48; 1160 1161 x1 = xb[4]; 1162 x2 = xb[5]; 1163 x3 = xb[6]; 1164 x4 = xb[7]; 1165 1166 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1167 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1168 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1169 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1170 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1171 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1172 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1173 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1174 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1175 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1176 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1177 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1178 v += 48; 1179 1180 x1 = xb[8]; 1181 x2 = xb[9]; 1182 x3 = xb[10]; 1183 x4 = xb[11]; 1184 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1185 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1186 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1187 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1188 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1189 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1190 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1191 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1192 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1193 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1194 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1195 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1196 v += 48; 1197 } 1198 if (usecprow) z = zarray + 12 * ridx[i]; 1199 z[0] = sum1; 1200 z[1] = sum2; 1201 z[2] = sum3; 1202 z[3] = sum4; 1203 z[4] = sum5; 1204 z[5] = sum6; 1205 z[6] = sum7; 1206 z[7] = sum8; 1207 z[8] = sum9; 1208 z[9] = sum10; 1209 z[10] = sum11; 1210 z[11] = sum12; 1211 if (!usecprow) z += 12; 1212 } 1213 PetscCall(VecRestoreArrayRead(xx, &x)); 1214 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1215 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1220 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1221 { 1222 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1223 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1224 const PetscScalar *x, *xb; 1225 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1226 const MatScalar *v; 1227 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1228 PetscInt mbs = a->mbs, i, j, n; 1229 PetscBool usecprow = a->compressedrow.use; 1230 1231 PetscFunctionBegin; 1232 PetscCall(VecGetArrayRead(xx, &x)); 1233 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1234 1235 v = a->a; 1236 if (usecprow) { 1237 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1238 mbs = a->compressedrow.nrows; 1239 ii = a->compressedrow.i; 1240 ridx = a->compressedrow.rindex; 1241 } else { 1242 ii = a->i; 1243 y = yarray; 1244 z = zarray; 1245 } 1246 1247 for (i = 0; i < mbs; i++) { 1248 n = ii[i + 1] - ii[i]; 1249 idx = ij + ii[i]; 1250 1251 if (usecprow) { 1252 y = yarray + 12 * ridx[i]; 1253 z = zarray + 12 * ridx[i]; 1254 } 1255 sum1 = y[0]; 1256 sum2 = y[1]; 1257 sum3 = y[2]; 1258 sum4 = y[3]; 1259 sum5 = y[4]; 1260 sum6 = y[5]; 1261 sum7 = y[6]; 1262 sum8 = y[7]; 1263 sum9 = y[8]; 1264 sum10 = y[9]; 1265 sum11 = y[10]; 1266 sum12 = y[11]; 1267 1268 for (j = 0; j < n; j++) { 1269 xb = x + 12 * (idx[j]); 1270 x1 = xb[0]; 1271 x2 = xb[1]; 1272 x3 = xb[2]; 1273 x4 = xb[3]; 1274 1275 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1276 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1277 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1278 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1279 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1280 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1281 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1282 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1283 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1284 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1285 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1286 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1287 v += 48; 1288 1289 x1 = xb[4]; 1290 x2 = xb[5]; 1291 x3 = xb[6]; 1292 x4 = xb[7]; 1293 1294 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1295 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1296 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1297 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1298 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1299 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1300 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1301 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1302 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1303 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1304 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1305 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1306 v += 48; 1307 1308 x1 = xb[8]; 1309 x2 = xb[9]; 1310 x3 = xb[10]; 1311 x4 = xb[11]; 1312 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1313 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1314 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1315 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1316 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1317 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1318 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1319 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1320 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1321 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1322 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1323 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1324 v += 48; 1325 } 1326 z[0] = sum1; 1327 z[1] = sum2; 1328 z[2] = sum3; 1329 z[3] = sum4; 1330 z[4] = sum5; 1331 z[5] = sum6; 1332 z[6] = sum7; 1333 z[7] = sum8; 1334 z[8] = sum9; 1335 z[9] = sum10; 1336 z[10] = sum11; 1337 z[11] = sum12; 1338 if (!usecprow) { 1339 y += 12; 1340 z += 12; 1341 } 1342 } 1343 PetscCall(VecRestoreArrayRead(xx, &x)); 1344 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1345 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1346 PetscFunctionReturn(PETSC_SUCCESS); 1347 } 1348 1349 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1350 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1351 { 1352 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1353 PetscScalar *z = NULL, *zarray; 1354 const PetscScalar *x, *work; 1355 const MatScalar *v = a->a; 1356 PetscInt mbs, i, j, n; 1357 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1358 PetscBool usecprow = a->compressedrow.use; 1359 const PetscInt bs = 12, bs2 = 144; 1360 1361 __m256d a0, a1, a2, a3, a4, a5; 1362 __m256d w0, w1, w2, w3; 1363 __m256d z0, z1, z2; 1364 1365 PetscFunctionBegin; 1366 PetscCall(VecGetArrayRead(xx, &x)); 1367 PetscCall(VecGetArrayWrite(zz, &zarray)); 1368 1369 if (usecprow) { 1370 mbs = a->compressedrow.nrows; 1371 ii = a->compressedrow.i; 1372 ridx = a->compressedrow.rindex; 1373 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1374 } else { 1375 mbs = a->mbs; 1376 ii = a->i; 1377 z = zarray; 1378 } 1379 1380 for (i = 0; i < mbs; i++) { 1381 z0 = _mm256_setzero_pd(); 1382 z1 = _mm256_setzero_pd(); 1383 z2 = _mm256_setzero_pd(); 1384 1385 n = ii[1] - ii[0]; 1386 ii++; 1387 for (j = 0; j < n; j++) { 1388 work = x + bs * (*idx++); 1389 1390 /* first column of a */ 1391 w0 = _mm256_set1_pd(work[0]); 1392 a0 = _mm256_loadu_pd(v + 0); 1393 z0 = _mm256_fmadd_pd(a0, w0, z0); 1394 a1 = _mm256_loadu_pd(v + 4); 1395 z1 = _mm256_fmadd_pd(a1, w0, z1); 1396 a2 = _mm256_loadu_pd(v + 8); 1397 z2 = _mm256_fmadd_pd(a2, w0, z2); 1398 1399 /* second column of a */ 1400 w1 = _mm256_set1_pd(work[1]); 1401 a3 = _mm256_loadu_pd(v + 12); 1402 z0 = _mm256_fmadd_pd(a3, w1, z0); 1403 a4 = _mm256_loadu_pd(v + 16); 1404 z1 = _mm256_fmadd_pd(a4, w1, z1); 1405 a5 = _mm256_loadu_pd(v + 20); 1406 z2 = _mm256_fmadd_pd(a5, w1, z2); 1407 1408 /* third column of a */ 1409 w2 = _mm256_set1_pd(work[2]); 1410 a0 = _mm256_loadu_pd(v + 24); 1411 z0 = _mm256_fmadd_pd(a0, w2, z0); 1412 a1 = _mm256_loadu_pd(v + 28); 1413 z1 = _mm256_fmadd_pd(a1, w2, z1); 1414 a2 = _mm256_loadu_pd(v + 32); 1415 z2 = _mm256_fmadd_pd(a2, w2, z2); 1416 1417 /* fourth column of a */ 1418 w3 = _mm256_set1_pd(work[3]); 1419 a3 = _mm256_loadu_pd(v + 36); 1420 z0 = _mm256_fmadd_pd(a3, w3, z0); 1421 a4 = _mm256_loadu_pd(v + 40); 1422 z1 = _mm256_fmadd_pd(a4, w3, z1); 1423 a5 = _mm256_loadu_pd(v + 44); 1424 z2 = _mm256_fmadd_pd(a5, w3, z2); 1425 1426 /* fifth column of a */ 1427 w0 = _mm256_set1_pd(work[4]); 1428 a0 = _mm256_loadu_pd(v + 48); 1429 z0 = _mm256_fmadd_pd(a0, w0, z0); 1430 a1 = _mm256_loadu_pd(v + 52); 1431 z1 = _mm256_fmadd_pd(a1, w0, z1); 1432 a2 = _mm256_loadu_pd(v + 56); 1433 z2 = _mm256_fmadd_pd(a2, w0, z2); 1434 1435 /* sixth column of a */ 1436 w1 = _mm256_set1_pd(work[5]); 1437 a3 = _mm256_loadu_pd(v + 60); 1438 z0 = _mm256_fmadd_pd(a3, w1, z0); 1439 a4 = _mm256_loadu_pd(v + 64); 1440 z1 = _mm256_fmadd_pd(a4, w1, z1); 1441 a5 = _mm256_loadu_pd(v + 68); 1442 z2 = _mm256_fmadd_pd(a5, w1, z2); 1443 1444 /* seventh column of a */ 1445 w2 = _mm256_set1_pd(work[6]); 1446 a0 = _mm256_loadu_pd(v + 72); 1447 z0 = _mm256_fmadd_pd(a0, w2, z0); 1448 a1 = _mm256_loadu_pd(v + 76); 1449 z1 = _mm256_fmadd_pd(a1, w2, z1); 1450 a2 = _mm256_loadu_pd(v + 80); 1451 z2 = _mm256_fmadd_pd(a2, w2, z2); 1452 1453 /* eighth column of a */ 1454 w3 = _mm256_set1_pd(work[7]); 1455 a3 = _mm256_loadu_pd(v + 84); 1456 z0 = _mm256_fmadd_pd(a3, w3, z0); 1457 a4 = _mm256_loadu_pd(v + 88); 1458 z1 = _mm256_fmadd_pd(a4, w3, z1); 1459 a5 = _mm256_loadu_pd(v + 92); 1460 z2 = _mm256_fmadd_pd(a5, w3, z2); 1461 1462 /* ninth column of a */ 1463 w0 = _mm256_set1_pd(work[8]); 1464 a0 = _mm256_loadu_pd(v + 96); 1465 z0 = _mm256_fmadd_pd(a0, w0, z0); 1466 a1 = _mm256_loadu_pd(v + 100); 1467 z1 = _mm256_fmadd_pd(a1, w0, z1); 1468 a2 = _mm256_loadu_pd(v + 104); 1469 z2 = _mm256_fmadd_pd(a2, w0, z2); 1470 1471 /* tenth column of a */ 1472 w1 = _mm256_set1_pd(work[9]); 1473 a3 = _mm256_loadu_pd(v + 108); 1474 z0 = _mm256_fmadd_pd(a3, w1, z0); 1475 a4 = _mm256_loadu_pd(v + 112); 1476 z1 = _mm256_fmadd_pd(a4, w1, z1); 1477 a5 = _mm256_loadu_pd(v + 116); 1478 z2 = _mm256_fmadd_pd(a5, w1, z2); 1479 1480 /* eleventh column of a */ 1481 w2 = _mm256_set1_pd(work[10]); 1482 a0 = _mm256_loadu_pd(v + 120); 1483 z0 = _mm256_fmadd_pd(a0, w2, z0); 1484 a1 = _mm256_loadu_pd(v + 124); 1485 z1 = _mm256_fmadd_pd(a1, w2, z1); 1486 a2 = _mm256_loadu_pd(v + 128); 1487 z2 = _mm256_fmadd_pd(a2, w2, z2); 1488 1489 /* twelveth column of a */ 1490 w3 = _mm256_set1_pd(work[11]); 1491 a3 = _mm256_loadu_pd(v + 132); 1492 z0 = _mm256_fmadd_pd(a3, w3, z0); 1493 a4 = _mm256_loadu_pd(v + 136); 1494 z1 = _mm256_fmadd_pd(a4, w3, z1); 1495 a5 = _mm256_loadu_pd(v + 140); 1496 z2 = _mm256_fmadd_pd(a5, w3, z2); 1497 1498 v += bs2; 1499 } 1500 if (usecprow) z = zarray + bs * ridx[i]; 1501 _mm256_storeu_pd(&z[0], z0); 1502 _mm256_storeu_pd(&z[4], z1); 1503 _mm256_storeu_pd(&z[8], z2); 1504 if (!usecprow) z += bs; 1505 } 1506 PetscCall(VecRestoreArrayRead(xx, &x)); 1507 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1508 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1509 PetscFunctionReturn(PETSC_SUCCESS); 1510 } 1511 #endif 1512 1513 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1514 /* Default MatMult for block size 15 */ 1515 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1516 { 1517 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1518 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1519 const PetscScalar *x, *xb; 1520 PetscScalar *zarray, xv; 1521 const MatScalar *v; 1522 const PetscInt *ii, *ij = a->j, *idx; 1523 PetscInt mbs, i, j, k, n, *ridx = NULL; 1524 PetscBool usecprow = a->compressedrow.use; 1525 1526 PetscFunctionBegin; 1527 PetscCall(VecGetArrayRead(xx, &x)); 1528 PetscCall(VecGetArrayWrite(zz, &zarray)); 1529 1530 v = a->a; 1531 if (usecprow) { 1532 mbs = a->compressedrow.nrows; 1533 ii = a->compressedrow.i; 1534 ridx = a->compressedrow.rindex; 1535 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1536 } else { 1537 mbs = a->mbs; 1538 ii = a->i; 1539 z = zarray; 1540 } 1541 1542 for (i = 0; i < mbs; i++) { 1543 n = ii[i + 1] - ii[i]; 1544 idx = ij + ii[i]; 1545 sum1 = 0.0; 1546 sum2 = 0.0; 1547 sum3 = 0.0; 1548 sum4 = 0.0; 1549 sum5 = 0.0; 1550 sum6 = 0.0; 1551 sum7 = 0.0; 1552 sum8 = 0.0; 1553 sum9 = 0.0; 1554 sum10 = 0.0; 1555 sum11 = 0.0; 1556 sum12 = 0.0; 1557 sum13 = 0.0; 1558 sum14 = 0.0; 1559 sum15 = 0.0; 1560 1561 for (j = 0; j < n; j++) { 1562 xb = x + 15 * (idx[j]); 1563 1564 for (k = 0; k < 15; k++) { 1565 xv = xb[k]; 1566 sum1 += v[0] * xv; 1567 sum2 += v[1] * xv; 1568 sum3 += v[2] * xv; 1569 sum4 += v[3] * xv; 1570 sum5 += v[4] * xv; 1571 sum6 += v[5] * xv; 1572 sum7 += v[6] * xv; 1573 sum8 += v[7] * xv; 1574 sum9 += v[8] * xv; 1575 sum10 += v[9] * xv; 1576 sum11 += v[10] * xv; 1577 sum12 += v[11] * xv; 1578 sum13 += v[12] * xv; 1579 sum14 += v[13] * xv; 1580 sum15 += v[14] * xv; 1581 v += 15; 1582 } 1583 } 1584 if (usecprow) z = zarray + 15 * ridx[i]; 1585 z[0] = sum1; 1586 z[1] = sum2; 1587 z[2] = sum3; 1588 z[3] = sum4; 1589 z[4] = sum5; 1590 z[5] = sum6; 1591 z[6] = sum7; 1592 z[7] = sum8; 1593 z[8] = sum9; 1594 z[9] = sum10; 1595 z[10] = sum11; 1596 z[11] = sum12; 1597 z[12] = sum13; 1598 z[13] = sum14; 1599 z[14] = sum15; 1600 1601 if (!usecprow) z += 15; 1602 } 1603 1604 PetscCall(VecRestoreArrayRead(xx, &x)); 1605 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1606 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1611 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1612 { 1613 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1614 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1615 const PetscScalar *x, *xb; 1616 PetscScalar x1, x2, x3, x4, *zarray; 1617 const MatScalar *v; 1618 const PetscInt *ii, *ij = a->j, *idx; 1619 PetscInt mbs, i, j, n, *ridx = NULL; 1620 PetscBool usecprow = a->compressedrow.use; 1621 1622 PetscFunctionBegin; 1623 PetscCall(VecGetArrayRead(xx, &x)); 1624 PetscCall(VecGetArrayWrite(zz, &zarray)); 1625 1626 v = a->a; 1627 if (usecprow) { 1628 mbs = a->compressedrow.nrows; 1629 ii = a->compressedrow.i; 1630 ridx = a->compressedrow.rindex; 1631 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1632 } else { 1633 mbs = a->mbs; 1634 ii = a->i; 1635 z = zarray; 1636 } 1637 1638 for (i = 0; i < mbs; i++) { 1639 n = ii[i + 1] - ii[i]; 1640 idx = ij + ii[i]; 1641 sum1 = 0.0; 1642 sum2 = 0.0; 1643 sum3 = 0.0; 1644 sum4 = 0.0; 1645 sum5 = 0.0; 1646 sum6 = 0.0; 1647 sum7 = 0.0; 1648 sum8 = 0.0; 1649 sum9 = 0.0; 1650 sum10 = 0.0; 1651 sum11 = 0.0; 1652 sum12 = 0.0; 1653 sum13 = 0.0; 1654 sum14 = 0.0; 1655 sum15 = 0.0; 1656 1657 for (j = 0; j < n; j++) { 1658 xb = x + 15 * (idx[j]); 1659 x1 = xb[0]; 1660 x2 = xb[1]; 1661 x3 = xb[2]; 1662 x4 = xb[3]; 1663 1664 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1665 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1666 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1667 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1668 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1669 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1670 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1671 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1672 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1673 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1674 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1675 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1676 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1677 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1678 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1679 1680 v += 60; 1681 1682 x1 = xb[4]; 1683 x2 = xb[5]; 1684 x3 = xb[6]; 1685 x4 = xb[7]; 1686 1687 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1688 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1689 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1690 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1691 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1692 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1693 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1694 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1695 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1696 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1697 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1698 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1699 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1700 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1701 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1702 v += 60; 1703 1704 x1 = xb[8]; 1705 x2 = xb[9]; 1706 x3 = xb[10]; 1707 x4 = xb[11]; 1708 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1709 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1710 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1711 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1712 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1713 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1714 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1715 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1716 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1717 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1718 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1719 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1720 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1721 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1722 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1723 v += 60; 1724 1725 x1 = xb[12]; 1726 x2 = xb[13]; 1727 x3 = xb[14]; 1728 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1729 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1730 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1731 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1732 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1733 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1734 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1735 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1736 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1737 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1738 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1739 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1740 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1741 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1742 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1743 v += 45; 1744 } 1745 if (usecprow) z = zarray + 15 * ridx[i]; 1746 z[0] = sum1; 1747 z[1] = sum2; 1748 z[2] = sum3; 1749 z[3] = sum4; 1750 z[4] = sum5; 1751 z[5] = sum6; 1752 z[6] = sum7; 1753 z[7] = sum8; 1754 z[8] = sum9; 1755 z[9] = sum10; 1756 z[10] = sum11; 1757 z[11] = sum12; 1758 z[12] = sum13; 1759 z[13] = sum14; 1760 z[14] = sum15; 1761 1762 if (!usecprow) z += 15; 1763 } 1764 1765 PetscCall(VecRestoreArrayRead(xx, &x)); 1766 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1767 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1768 PetscFunctionReturn(PETSC_SUCCESS); 1769 } 1770 1771 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1772 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1773 { 1774 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1775 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1776 const PetscScalar *x, *xb; 1777 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1778 const MatScalar *v; 1779 const PetscInt *ii, *ij = a->j, *idx; 1780 PetscInt mbs, i, j, n, *ridx = NULL; 1781 PetscBool usecprow = a->compressedrow.use; 1782 1783 PetscFunctionBegin; 1784 PetscCall(VecGetArrayRead(xx, &x)); 1785 PetscCall(VecGetArrayWrite(zz, &zarray)); 1786 1787 v = a->a; 1788 if (usecprow) { 1789 mbs = a->compressedrow.nrows; 1790 ii = a->compressedrow.i; 1791 ridx = a->compressedrow.rindex; 1792 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1793 } else { 1794 mbs = a->mbs; 1795 ii = a->i; 1796 z = zarray; 1797 } 1798 1799 for (i = 0; i < mbs; i++) { 1800 n = ii[i + 1] - ii[i]; 1801 idx = ij + ii[i]; 1802 sum1 = 0.0; 1803 sum2 = 0.0; 1804 sum3 = 0.0; 1805 sum4 = 0.0; 1806 sum5 = 0.0; 1807 sum6 = 0.0; 1808 sum7 = 0.0; 1809 sum8 = 0.0; 1810 sum9 = 0.0; 1811 sum10 = 0.0; 1812 sum11 = 0.0; 1813 sum12 = 0.0; 1814 sum13 = 0.0; 1815 sum14 = 0.0; 1816 sum15 = 0.0; 1817 1818 for (j = 0; j < n; j++) { 1819 xb = x + 15 * (idx[j]); 1820 x1 = xb[0]; 1821 x2 = xb[1]; 1822 x3 = xb[2]; 1823 x4 = xb[3]; 1824 x5 = xb[4]; 1825 x6 = xb[5]; 1826 x7 = xb[6]; 1827 x8 = xb[7]; 1828 1829 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1830 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1831 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1832 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1833 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1834 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1835 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1836 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1837 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1838 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1839 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1840 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1841 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1842 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1843 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1844 v += 120; 1845 1846 x1 = xb[8]; 1847 x2 = xb[9]; 1848 x3 = xb[10]; 1849 x4 = xb[11]; 1850 x5 = xb[12]; 1851 x6 = xb[13]; 1852 x7 = xb[14]; 1853 1854 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1855 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1856 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1857 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1858 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1859 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1860 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1861 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1862 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1863 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1864 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1865 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1866 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1867 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1868 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1869 v += 105; 1870 } 1871 if (usecprow) z = zarray + 15 * ridx[i]; 1872 z[0] = sum1; 1873 z[1] = sum2; 1874 z[2] = sum3; 1875 z[3] = sum4; 1876 z[4] = sum5; 1877 z[5] = sum6; 1878 z[6] = sum7; 1879 z[7] = sum8; 1880 z[8] = sum9; 1881 z[9] = sum10; 1882 z[10] = sum11; 1883 z[11] = sum12; 1884 z[12] = sum13; 1885 z[13] = sum14; 1886 z[14] = sum15; 1887 1888 if (!usecprow) z += 15; 1889 } 1890 1891 PetscCall(VecRestoreArrayRead(xx, &x)); 1892 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1893 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1894 PetscFunctionReturn(PETSC_SUCCESS); 1895 } 1896 1897 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1898 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) 1899 { 1900 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1901 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1902 const PetscScalar *x, *xb; 1903 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1904 const MatScalar *v; 1905 const PetscInt *ii, *ij = a->j, *idx; 1906 PetscInt mbs, i, j, n, *ridx = NULL; 1907 PetscBool usecprow = a->compressedrow.use; 1908 1909 PetscFunctionBegin; 1910 PetscCall(VecGetArrayRead(xx, &x)); 1911 PetscCall(VecGetArrayWrite(zz, &zarray)); 1912 1913 v = a->a; 1914 if (usecprow) { 1915 mbs = a->compressedrow.nrows; 1916 ii = a->compressedrow.i; 1917 ridx = a->compressedrow.rindex; 1918 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1919 } else { 1920 mbs = a->mbs; 1921 ii = a->i; 1922 z = zarray; 1923 } 1924 1925 for (i = 0; i < mbs; i++) { 1926 n = ii[i + 1] - ii[i]; 1927 idx = ij + ii[i]; 1928 sum1 = 0.0; 1929 sum2 = 0.0; 1930 sum3 = 0.0; 1931 sum4 = 0.0; 1932 sum5 = 0.0; 1933 sum6 = 0.0; 1934 sum7 = 0.0; 1935 sum8 = 0.0; 1936 sum9 = 0.0; 1937 sum10 = 0.0; 1938 sum11 = 0.0; 1939 sum12 = 0.0; 1940 sum13 = 0.0; 1941 sum14 = 0.0; 1942 sum15 = 0.0; 1943 1944 for (j = 0; j < n; j++) { 1945 xb = x + 15 * (idx[j]); 1946 x1 = xb[0]; 1947 x2 = xb[1]; 1948 x3 = xb[2]; 1949 x4 = xb[3]; 1950 x5 = xb[4]; 1951 x6 = xb[5]; 1952 x7 = xb[6]; 1953 x8 = xb[7]; 1954 x9 = xb[8]; 1955 x10 = xb[9]; 1956 x11 = xb[10]; 1957 x12 = xb[11]; 1958 x13 = xb[12]; 1959 x14 = xb[13]; 1960 x15 = xb[14]; 1961 1962 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1963 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1964 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1965 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1966 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1967 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1968 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1969 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1970 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1971 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1972 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1973 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1974 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1975 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1976 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1977 v += 225; 1978 } 1979 if (usecprow) z = zarray + 15 * ridx[i]; 1980 z[0] = sum1; 1981 z[1] = sum2; 1982 z[2] = sum3; 1983 z[3] = sum4; 1984 z[4] = sum5; 1985 z[5] = sum6; 1986 z[6] = sum7; 1987 z[7] = sum8; 1988 z[8] = sum9; 1989 z[9] = sum10; 1990 z[10] = sum11; 1991 z[11] = sum12; 1992 z[12] = sum13; 1993 z[13] = sum14; 1994 z[14] = sum15; 1995 1996 if (!usecprow) z += 15; 1997 } 1998 1999 PetscCall(VecRestoreArrayRead(xx, &x)); 2000 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2001 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 2002 PetscFunctionReturn(PETSC_SUCCESS); 2003 } 2004 2005 /* 2006 This will not work with MatScalar == float because it calls the BLAS 2007 */ 2008 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 2009 { 2010 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2011 PetscScalar *z = NULL, *work, *workt, *zarray; 2012 const PetscScalar *x, *xb; 2013 const MatScalar *v; 2014 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2015 const PetscInt *idx, *ii, *ridx = NULL; 2016 PetscInt ncols, k; 2017 PetscBool usecprow = a->compressedrow.use; 2018 2019 PetscFunctionBegin; 2020 PetscCall(VecGetArrayRead(xx, &x)); 2021 PetscCall(VecGetArrayWrite(zz, &zarray)); 2022 2023 idx = a->j; 2024 v = a->a; 2025 if (usecprow) { 2026 mbs = a->compressedrow.nrows; 2027 ii = a->compressedrow.i; 2028 ridx = a->compressedrow.rindex; 2029 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2030 } else { 2031 mbs = a->mbs; 2032 ii = a->i; 2033 z = zarray; 2034 } 2035 2036 if (!a->mult_work) { 2037 k = PetscMax(A->rmap->n, A->cmap->n); 2038 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2039 } 2040 work = a->mult_work; 2041 for (i = 0; i < mbs; i++) { 2042 n = ii[1] - ii[0]; 2043 ii++; 2044 ncols = n * bs; 2045 workt = work; 2046 for (j = 0; j < n; j++) { 2047 xb = x + bs * (*idx++); 2048 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2049 workt += bs; 2050 } 2051 if (usecprow) z = zarray + bs * ridx[i]; 2052 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2053 v += n * bs2; 2054 if (!usecprow) z += bs; 2055 } 2056 PetscCall(VecRestoreArrayRead(xx, &x)); 2057 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2058 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2059 PetscFunctionReturn(PETSC_SUCCESS); 2060 } 2061 2062 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2063 { 2064 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2065 const PetscScalar *x; 2066 PetscScalar *y, *z, sum; 2067 const MatScalar *v; 2068 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2069 const PetscInt *idx, *ii; 2070 PetscBool usecprow = a->compressedrow.use; 2071 2072 PetscFunctionBegin; 2073 PetscCall(VecGetArrayRead(xx, &x)); 2074 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2075 2076 idx = a->j; 2077 v = a->a; 2078 if (usecprow) { 2079 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 2080 mbs = a->compressedrow.nrows; 2081 ii = a->compressedrow.i; 2082 ridx = a->compressedrow.rindex; 2083 } else { 2084 ii = a->i; 2085 } 2086 2087 for (i = 0; i < mbs; i++) { 2088 n = ii[1] - ii[0]; 2089 ii++; 2090 if (!usecprow) { 2091 sum = y[i]; 2092 } else { 2093 sum = y[ridx[i]]; 2094 } 2095 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2096 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2097 PetscSparseDensePlusDot(sum, x, v, idx, n); 2098 v += n; 2099 idx += n; 2100 if (usecprow) { 2101 z[ridx[i]] = sum; 2102 } else { 2103 z[i] = sum; 2104 } 2105 } 2106 PetscCall(VecRestoreArrayRead(xx, &x)); 2107 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2108 PetscCall(PetscLogFlops(2.0 * a->nz)); 2109 PetscFunctionReturn(PETSC_SUCCESS); 2110 } 2111 2112 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2113 { 2114 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2115 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2116 const PetscScalar *x, *xb; 2117 PetscScalar x1, x2, *yarray, *zarray; 2118 const MatScalar *v; 2119 PetscInt mbs = a->mbs, i, n, j; 2120 const PetscInt *idx, *ii, *ridx = NULL; 2121 PetscBool usecprow = a->compressedrow.use; 2122 2123 PetscFunctionBegin; 2124 PetscCall(VecGetArrayRead(xx, &x)); 2125 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2126 2127 idx = a->j; 2128 v = a->a; 2129 if (usecprow) { 2130 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 2131 mbs = a->compressedrow.nrows; 2132 ii = a->compressedrow.i; 2133 ridx = a->compressedrow.rindex; 2134 } else { 2135 ii = a->i; 2136 y = yarray; 2137 z = zarray; 2138 } 2139 2140 for (i = 0; i < mbs; i++) { 2141 n = ii[1] - ii[0]; 2142 ii++; 2143 if (usecprow) { 2144 z = zarray + 2 * ridx[i]; 2145 y = yarray + 2 * ridx[i]; 2146 } 2147 sum1 = y[0]; 2148 sum2 = y[1]; 2149 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2150 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2151 for (j = 0; j < n; j++) { 2152 xb = x + 2 * (*idx++); 2153 x1 = xb[0]; 2154 x2 = xb[1]; 2155 2156 sum1 += v[0] * x1 + v[2] * x2; 2157 sum2 += v[1] * x1 + v[3] * x2; 2158 v += 4; 2159 } 2160 z[0] = sum1; 2161 z[1] = sum2; 2162 if (!usecprow) { 2163 z += 2; 2164 y += 2; 2165 } 2166 } 2167 PetscCall(VecRestoreArrayRead(xx, &x)); 2168 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2169 PetscCall(PetscLogFlops(4.0 * a->nz)); 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2174 { 2175 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2176 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2177 const PetscScalar *x, *xb; 2178 const MatScalar *v; 2179 PetscInt mbs = a->mbs, i, j, n; 2180 const PetscInt *idx, *ii, *ridx = NULL; 2181 PetscBool usecprow = a->compressedrow.use; 2182 2183 PetscFunctionBegin; 2184 PetscCall(VecGetArrayRead(xx, &x)); 2185 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2186 2187 idx = a->j; 2188 v = a->a; 2189 if (usecprow) { 2190 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 2191 mbs = a->compressedrow.nrows; 2192 ii = a->compressedrow.i; 2193 ridx = a->compressedrow.rindex; 2194 } else { 2195 ii = a->i; 2196 y = yarray; 2197 z = zarray; 2198 } 2199 2200 for (i = 0; i < mbs; i++) { 2201 n = ii[1] - ii[0]; 2202 ii++; 2203 if (usecprow) { 2204 z = zarray + 3 * ridx[i]; 2205 y = yarray + 3 * ridx[i]; 2206 } 2207 sum1 = y[0]; 2208 sum2 = y[1]; 2209 sum3 = y[2]; 2210 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2211 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2212 for (j = 0; j < n; j++) { 2213 xb = x + 3 * (*idx++); 2214 x1 = xb[0]; 2215 x2 = xb[1]; 2216 x3 = xb[2]; 2217 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2218 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2219 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2220 v += 9; 2221 } 2222 z[0] = sum1; 2223 z[1] = sum2; 2224 z[2] = sum3; 2225 if (!usecprow) { 2226 z += 3; 2227 y += 3; 2228 } 2229 } 2230 PetscCall(VecRestoreArrayRead(xx, &x)); 2231 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2232 PetscCall(PetscLogFlops(18.0 * a->nz)); 2233 PetscFunctionReturn(PETSC_SUCCESS); 2234 } 2235 2236 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2237 { 2238 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2239 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2240 const PetscScalar *x, *xb; 2241 const MatScalar *v; 2242 PetscInt mbs = a->mbs, i, j, n; 2243 const PetscInt *idx, *ii, *ridx = NULL; 2244 PetscBool usecprow = a->compressedrow.use; 2245 2246 PetscFunctionBegin; 2247 PetscCall(VecGetArrayRead(xx, &x)); 2248 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2249 2250 idx = a->j; 2251 v = a->a; 2252 if (usecprow) { 2253 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 2254 mbs = a->compressedrow.nrows; 2255 ii = a->compressedrow.i; 2256 ridx = a->compressedrow.rindex; 2257 } else { 2258 ii = a->i; 2259 y = yarray; 2260 z = zarray; 2261 } 2262 2263 for (i = 0; i < mbs; i++) { 2264 n = ii[1] - ii[0]; 2265 ii++; 2266 if (usecprow) { 2267 z = zarray + 4 * ridx[i]; 2268 y = yarray + 4 * ridx[i]; 2269 } 2270 sum1 = y[0]; 2271 sum2 = y[1]; 2272 sum3 = y[2]; 2273 sum4 = y[3]; 2274 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2275 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2276 for (j = 0; j < n; j++) { 2277 xb = x + 4 * (*idx++); 2278 x1 = xb[0]; 2279 x2 = xb[1]; 2280 x3 = xb[2]; 2281 x4 = xb[3]; 2282 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2283 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2284 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2285 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2286 v += 16; 2287 } 2288 z[0] = sum1; 2289 z[1] = sum2; 2290 z[2] = sum3; 2291 z[3] = sum4; 2292 if (!usecprow) { 2293 z += 4; 2294 y += 4; 2295 } 2296 } 2297 PetscCall(VecRestoreArrayRead(xx, &x)); 2298 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2299 PetscCall(PetscLogFlops(32.0 * a->nz)); 2300 PetscFunctionReturn(PETSC_SUCCESS); 2301 } 2302 2303 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2304 { 2305 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2306 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2307 const PetscScalar *x, *xb; 2308 PetscScalar *yarray, *zarray; 2309 const MatScalar *v; 2310 PetscInt mbs = a->mbs, i, j, n; 2311 const PetscInt *idx, *ii, *ridx = NULL; 2312 PetscBool usecprow = a->compressedrow.use; 2313 2314 PetscFunctionBegin; 2315 PetscCall(VecGetArrayRead(xx, &x)); 2316 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2317 2318 idx = a->j; 2319 v = a->a; 2320 if (usecprow) { 2321 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 2322 mbs = a->compressedrow.nrows; 2323 ii = a->compressedrow.i; 2324 ridx = a->compressedrow.rindex; 2325 } else { 2326 ii = a->i; 2327 y = yarray; 2328 z = zarray; 2329 } 2330 2331 for (i = 0; i < mbs; i++) { 2332 n = ii[1] - ii[0]; 2333 ii++; 2334 if (usecprow) { 2335 z = zarray + 5 * ridx[i]; 2336 y = yarray + 5 * ridx[i]; 2337 } 2338 sum1 = y[0]; 2339 sum2 = y[1]; 2340 sum3 = y[2]; 2341 sum4 = y[3]; 2342 sum5 = y[4]; 2343 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2344 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2345 for (j = 0; j < n; j++) { 2346 xb = x + 5 * (*idx++); 2347 x1 = xb[0]; 2348 x2 = xb[1]; 2349 x3 = xb[2]; 2350 x4 = xb[3]; 2351 x5 = xb[4]; 2352 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2353 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2354 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2355 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2356 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2357 v += 25; 2358 } 2359 z[0] = sum1; 2360 z[1] = sum2; 2361 z[2] = sum3; 2362 z[3] = sum4; 2363 z[4] = sum5; 2364 if (!usecprow) { 2365 z += 5; 2366 y += 5; 2367 } 2368 } 2369 PetscCall(VecRestoreArrayRead(xx, &x)); 2370 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2371 PetscCall(PetscLogFlops(50.0 * a->nz)); 2372 PetscFunctionReturn(PETSC_SUCCESS); 2373 } 2374 2375 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2376 { 2377 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2378 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2379 const PetscScalar *x, *xb; 2380 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2381 const MatScalar *v; 2382 PetscInt mbs = a->mbs, i, j, n; 2383 const PetscInt *idx, *ii, *ridx = NULL; 2384 PetscBool usecprow = a->compressedrow.use; 2385 2386 PetscFunctionBegin; 2387 PetscCall(VecGetArrayRead(xx, &x)); 2388 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2389 2390 idx = a->j; 2391 v = a->a; 2392 if (usecprow) { 2393 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 2394 mbs = a->compressedrow.nrows; 2395 ii = a->compressedrow.i; 2396 ridx = a->compressedrow.rindex; 2397 } else { 2398 ii = a->i; 2399 y = yarray; 2400 z = zarray; 2401 } 2402 2403 for (i = 0; i < mbs; i++) { 2404 n = ii[1] - ii[0]; 2405 ii++; 2406 if (usecprow) { 2407 z = zarray + 6 * ridx[i]; 2408 y = yarray + 6 * ridx[i]; 2409 } 2410 sum1 = y[0]; 2411 sum2 = y[1]; 2412 sum3 = y[2]; 2413 sum4 = y[3]; 2414 sum5 = y[4]; 2415 sum6 = y[5]; 2416 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2417 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2418 for (j = 0; j < n; j++) { 2419 xb = x + 6 * (*idx++); 2420 x1 = xb[0]; 2421 x2 = xb[1]; 2422 x3 = xb[2]; 2423 x4 = xb[3]; 2424 x5 = xb[4]; 2425 x6 = xb[5]; 2426 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2427 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2428 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2429 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2430 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2431 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2432 v += 36; 2433 } 2434 z[0] = sum1; 2435 z[1] = sum2; 2436 z[2] = sum3; 2437 z[3] = sum4; 2438 z[4] = sum5; 2439 z[5] = sum6; 2440 if (!usecprow) { 2441 z += 6; 2442 y += 6; 2443 } 2444 } 2445 PetscCall(VecRestoreArrayRead(xx, &x)); 2446 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2447 PetscCall(PetscLogFlops(72.0 * a->nz)); 2448 PetscFunctionReturn(PETSC_SUCCESS); 2449 } 2450 2451 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2452 { 2453 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2454 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2455 const PetscScalar *x, *xb; 2456 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2457 const MatScalar *v; 2458 PetscInt mbs = a->mbs, i, j, n; 2459 const PetscInt *idx, *ii, *ridx = NULL; 2460 PetscBool usecprow = a->compressedrow.use; 2461 2462 PetscFunctionBegin; 2463 PetscCall(VecGetArrayRead(xx, &x)); 2464 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2465 2466 idx = a->j; 2467 v = a->a; 2468 if (usecprow) { 2469 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2470 mbs = a->compressedrow.nrows; 2471 ii = a->compressedrow.i; 2472 ridx = a->compressedrow.rindex; 2473 } else { 2474 ii = a->i; 2475 y = yarray; 2476 z = zarray; 2477 } 2478 2479 for (i = 0; i < mbs; i++) { 2480 n = ii[1] - ii[0]; 2481 ii++; 2482 if (usecprow) { 2483 z = zarray + 7 * ridx[i]; 2484 y = yarray + 7 * ridx[i]; 2485 } 2486 sum1 = y[0]; 2487 sum2 = y[1]; 2488 sum3 = y[2]; 2489 sum4 = y[3]; 2490 sum5 = y[4]; 2491 sum6 = y[5]; 2492 sum7 = y[6]; 2493 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2494 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2495 for (j = 0; j < n; j++) { 2496 xb = x + 7 * (*idx++); 2497 x1 = xb[0]; 2498 x2 = xb[1]; 2499 x3 = xb[2]; 2500 x4 = xb[3]; 2501 x5 = xb[4]; 2502 x6 = xb[5]; 2503 x7 = xb[6]; 2504 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2505 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2506 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2507 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2508 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2509 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2510 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2511 v += 49; 2512 } 2513 z[0] = sum1; 2514 z[1] = sum2; 2515 z[2] = sum3; 2516 z[3] = sum4; 2517 z[4] = sum5; 2518 z[5] = sum6; 2519 z[6] = sum7; 2520 if (!usecprow) { 2521 z += 7; 2522 y += 7; 2523 } 2524 } 2525 PetscCall(VecRestoreArrayRead(xx, &x)); 2526 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2527 PetscCall(PetscLogFlops(98.0 * a->nz)); 2528 PetscFunctionReturn(PETSC_SUCCESS); 2529 } 2530 2531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2532 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2533 { 2534 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2535 PetscScalar *z = NULL, *work, *workt, *zarray; 2536 const PetscScalar *x, *xb; 2537 const MatScalar *v; 2538 PetscInt mbs, i, j, n; 2539 PetscInt k; 2540 PetscBool usecprow = a->compressedrow.use; 2541 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2542 2543 __m256d a0, a1, a2, a3, a4, a5; 2544 __m256d w0, w1, w2, w3; 2545 __m256d z0, z1, z2; 2546 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2547 2548 PetscFunctionBegin; 2549 PetscCall(VecCopy(yy, zz)); 2550 PetscCall(VecGetArrayRead(xx, &x)); 2551 PetscCall(VecGetArray(zz, &zarray)); 2552 2553 idx = a->j; 2554 v = a->a; 2555 if (usecprow) { 2556 mbs = a->compressedrow.nrows; 2557 ii = a->compressedrow.i; 2558 ridx = a->compressedrow.rindex; 2559 } else { 2560 mbs = a->mbs; 2561 ii = a->i; 2562 z = zarray; 2563 } 2564 2565 if (!a->mult_work) { 2566 k = PetscMax(A->rmap->n, A->cmap->n); 2567 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2568 } 2569 2570 work = a->mult_work; 2571 for (i = 0; i < mbs; i++) { 2572 n = ii[1] - ii[0]; 2573 ii++; 2574 workt = work; 2575 for (j = 0; j < n; j++) { 2576 xb = x + bs * (*idx++); 2577 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2578 workt += bs; 2579 } 2580 if (usecprow) z = zarray + bs * ridx[i]; 2581 2582 z0 = _mm256_loadu_pd(&z[0]); 2583 z1 = _mm256_loadu_pd(&z[4]); 2584 z2 = _mm256_set1_pd(z[8]); 2585 2586 for (j = 0; j < n; j++) { 2587 /* first column of a */ 2588 w0 = _mm256_set1_pd(work[j * 9]); 2589 a0 = _mm256_loadu_pd(&v[j * 81]); 2590 z0 = _mm256_fmadd_pd(a0, w0, z0); 2591 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2592 z1 = _mm256_fmadd_pd(a1, w0, z1); 2593 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2594 z2 = _mm256_fmadd_pd(a2, w0, z2); 2595 2596 /* second column of a */ 2597 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2598 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2599 z0 = _mm256_fmadd_pd(a0, w1, z0); 2600 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2601 z1 = _mm256_fmadd_pd(a1, w1, z1); 2602 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2603 z2 = _mm256_fmadd_pd(a2, w1, z2); 2604 2605 /* third column of a */ 2606 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2607 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2608 z0 = _mm256_fmadd_pd(a3, w2, z0); 2609 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2610 z1 = _mm256_fmadd_pd(a4, w2, z1); 2611 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2612 z2 = _mm256_fmadd_pd(a5, w2, z2); 2613 2614 /* fourth column of a */ 2615 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2616 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2617 z0 = _mm256_fmadd_pd(a0, w3, z0); 2618 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2619 z1 = _mm256_fmadd_pd(a1, w3, z1); 2620 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2621 z2 = _mm256_fmadd_pd(a2, w3, z2); 2622 2623 /* fifth column of a */ 2624 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2625 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2626 z0 = _mm256_fmadd_pd(a3, w0, z0); 2627 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2628 z1 = _mm256_fmadd_pd(a4, w0, z1); 2629 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2630 z2 = _mm256_fmadd_pd(a5, w0, z2); 2631 2632 /* sixth column of a */ 2633 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2634 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2635 z0 = _mm256_fmadd_pd(a0, w1, z0); 2636 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2637 z1 = _mm256_fmadd_pd(a1, w1, z1); 2638 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2639 z2 = _mm256_fmadd_pd(a2, w1, z2); 2640 2641 /* seventh column of a */ 2642 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2643 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2644 z0 = _mm256_fmadd_pd(a0, w2, z0); 2645 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2646 z1 = _mm256_fmadd_pd(a1, w2, z1); 2647 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2648 z2 = _mm256_fmadd_pd(a2, w2, z2); 2649 2650 /* eighth column of a */ 2651 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2652 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2653 z0 = _mm256_fmadd_pd(a3, w3, z0); 2654 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2655 z1 = _mm256_fmadd_pd(a4, w3, z1); 2656 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2657 z2 = _mm256_fmadd_pd(a5, w3, z2); 2658 2659 /* ninth column of a */ 2660 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2661 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2662 z0 = _mm256_fmadd_pd(a0, w0, z0); 2663 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2664 z1 = _mm256_fmadd_pd(a1, w0, z1); 2665 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2666 z2 = _mm256_fmadd_pd(a2, w0, z2); 2667 } 2668 2669 _mm256_storeu_pd(&z[0], z0); 2670 _mm256_storeu_pd(&z[4], z1); 2671 _mm256_maskstore_pd(&z[8], mask1, z2); 2672 2673 v += n * bs2; 2674 if (!usecprow) z += bs; 2675 } 2676 PetscCall(VecRestoreArrayRead(xx, &x)); 2677 PetscCall(VecRestoreArray(zz, &zarray)); 2678 PetscCall(PetscLogFlops(162.0 * a->nz)); 2679 PetscFunctionReturn(PETSC_SUCCESS); 2680 } 2681 #endif 2682 2683 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2684 { 2685 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2686 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2687 const PetscScalar *x, *xb; 2688 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2689 const MatScalar *v; 2690 PetscInt mbs = a->mbs, i, j, n; 2691 const PetscInt *idx, *ii, *ridx = NULL; 2692 PetscBool usecprow = a->compressedrow.use; 2693 2694 PetscFunctionBegin; 2695 PetscCall(VecGetArrayRead(xx, &x)); 2696 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2697 2698 idx = a->j; 2699 v = a->a; 2700 if (usecprow) { 2701 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2702 mbs = a->compressedrow.nrows; 2703 ii = a->compressedrow.i; 2704 ridx = a->compressedrow.rindex; 2705 } else { 2706 ii = a->i; 2707 y = yarray; 2708 z = zarray; 2709 } 2710 2711 for (i = 0; i < mbs; i++) { 2712 n = ii[1] - ii[0]; 2713 ii++; 2714 if (usecprow) { 2715 z = zarray + 11 * ridx[i]; 2716 y = yarray + 11 * ridx[i]; 2717 } 2718 sum1 = y[0]; 2719 sum2 = y[1]; 2720 sum3 = y[2]; 2721 sum4 = y[3]; 2722 sum5 = y[4]; 2723 sum6 = y[5]; 2724 sum7 = y[6]; 2725 sum8 = y[7]; 2726 sum9 = y[8]; 2727 sum10 = y[9]; 2728 sum11 = y[10]; 2729 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2730 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2731 for (j = 0; j < n; j++) { 2732 xb = x + 11 * (*idx++); 2733 x1 = xb[0]; 2734 x2 = xb[1]; 2735 x3 = xb[2]; 2736 x4 = xb[3]; 2737 x5 = xb[4]; 2738 x6 = xb[5]; 2739 x7 = xb[6]; 2740 x8 = xb[7]; 2741 x9 = xb[8]; 2742 x10 = xb[9]; 2743 x11 = xb[10]; 2744 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2745 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2746 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2747 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2748 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2749 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2750 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2751 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2752 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2753 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2754 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2755 v += 121; 2756 } 2757 z[0] = sum1; 2758 z[1] = sum2; 2759 z[2] = sum3; 2760 z[3] = sum4; 2761 z[4] = sum5; 2762 z[5] = sum6; 2763 z[6] = sum7; 2764 z[7] = sum8; 2765 z[8] = sum9; 2766 z[9] = sum10; 2767 z[10] = sum11; 2768 if (!usecprow) { 2769 z += 11; 2770 y += 11; 2771 } 2772 } 2773 PetscCall(VecRestoreArrayRead(xx, &x)); 2774 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2775 PetscCall(PetscLogFlops(242.0 * a->nz)); 2776 PetscFunctionReturn(PETSC_SUCCESS); 2777 } 2778 2779 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2780 { 2781 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2782 PetscScalar *z = NULL, *work, *workt, *zarray; 2783 const PetscScalar *x, *xb; 2784 const MatScalar *v; 2785 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2786 PetscInt ncols, k; 2787 const PetscInt *ridx = NULL, *idx, *ii; 2788 PetscBool usecprow = a->compressedrow.use; 2789 2790 PetscFunctionBegin; 2791 PetscCall(VecCopy(yy, zz)); 2792 PetscCall(VecGetArrayRead(xx, &x)); 2793 PetscCall(VecGetArray(zz, &zarray)); 2794 2795 idx = a->j; 2796 v = a->a; 2797 if (usecprow) { 2798 mbs = a->compressedrow.nrows; 2799 ii = a->compressedrow.i; 2800 ridx = a->compressedrow.rindex; 2801 } else { 2802 mbs = a->mbs; 2803 ii = a->i; 2804 z = zarray; 2805 } 2806 2807 if (!a->mult_work) { 2808 k = PetscMax(A->rmap->n, A->cmap->n); 2809 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2810 } 2811 work = a->mult_work; 2812 for (i = 0; i < mbs; i++) { 2813 n = ii[1] - ii[0]; 2814 ii++; 2815 ncols = n * bs; 2816 workt = work; 2817 for (j = 0; j < n; j++) { 2818 xb = x + bs * (*idx++); 2819 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2820 workt += bs; 2821 } 2822 if (usecprow) z = zarray + bs * ridx[i]; 2823 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2824 v += n * bs2; 2825 if (!usecprow) z += bs; 2826 } 2827 PetscCall(VecRestoreArrayRead(xx, &x)); 2828 PetscCall(VecRestoreArray(zz, &zarray)); 2829 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2830 PetscFunctionReturn(PETSC_SUCCESS); 2831 } 2832 2833 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2834 { 2835 PetscScalar zero = 0.0; 2836 2837 PetscFunctionBegin; 2838 PetscCall(VecSet(zz, zero)); 2839 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2840 PetscFunctionReturn(PETSC_SUCCESS); 2841 } 2842 2843 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2844 { 2845 PetscScalar zero = 0.0; 2846 2847 PetscFunctionBegin; 2848 PetscCall(VecSet(zz, zero)); 2849 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2850 PetscFunctionReturn(PETSC_SUCCESS); 2851 } 2852 2853 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2854 { 2855 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2856 PetscScalar *z, x1, x2, x3, x4, x5; 2857 const PetscScalar *x, *xb = NULL; 2858 const MatScalar *v; 2859 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2860 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2861 Mat_CompressedRow cprow = a->compressedrow; 2862 PetscBool usecprow = cprow.use; 2863 2864 PetscFunctionBegin; 2865 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2866 PetscCall(VecGetArrayRead(xx, &x)); 2867 PetscCall(VecGetArray(zz, &z)); 2868 2869 idx = a->j; 2870 v = a->a; 2871 if (usecprow) { 2872 mbs = cprow.nrows; 2873 ii = cprow.i; 2874 ridx = cprow.rindex; 2875 } else { 2876 mbs = a->mbs; 2877 ii = a->i; 2878 xb = x; 2879 } 2880 2881 switch (bs) { 2882 case 1: 2883 for (i = 0; i < mbs; i++) { 2884 if (usecprow) xb = x + ridx[i]; 2885 x1 = xb[0]; 2886 ib = idx + ii[0]; 2887 n = ii[1] - ii[0]; 2888 ii++; 2889 for (j = 0; j < n; j++) { 2890 rval = ib[j]; 2891 z[rval] += PetscConj(*v) * x1; 2892 v++; 2893 } 2894 if (!usecprow) xb++; 2895 } 2896 break; 2897 case 2: 2898 for (i = 0; i < mbs; i++) { 2899 if (usecprow) xb = x + 2 * ridx[i]; 2900 x1 = xb[0]; 2901 x2 = xb[1]; 2902 ib = idx + ii[0]; 2903 n = ii[1] - ii[0]; 2904 ii++; 2905 for (j = 0; j < n; j++) { 2906 rval = ib[j] * 2; 2907 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2908 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2909 v += 4; 2910 } 2911 if (!usecprow) xb += 2; 2912 } 2913 break; 2914 case 3: 2915 for (i = 0; i < mbs; i++) { 2916 if (usecprow) xb = x + 3 * ridx[i]; 2917 x1 = xb[0]; 2918 x2 = xb[1]; 2919 x3 = xb[2]; 2920 ib = idx + ii[0]; 2921 n = ii[1] - ii[0]; 2922 ii++; 2923 for (j = 0; j < n; j++) { 2924 rval = ib[j] * 3; 2925 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2926 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2927 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2928 v += 9; 2929 } 2930 if (!usecprow) xb += 3; 2931 } 2932 break; 2933 case 4: 2934 for (i = 0; i < mbs; i++) { 2935 if (usecprow) xb = x + 4 * ridx[i]; 2936 x1 = xb[0]; 2937 x2 = xb[1]; 2938 x3 = xb[2]; 2939 x4 = xb[3]; 2940 ib = idx + ii[0]; 2941 n = ii[1] - ii[0]; 2942 ii++; 2943 for (j = 0; j < n; j++) { 2944 rval = ib[j] * 4; 2945 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2946 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2947 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2948 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2949 v += 16; 2950 } 2951 if (!usecprow) xb += 4; 2952 } 2953 break; 2954 case 5: 2955 for (i = 0; i < mbs; i++) { 2956 if (usecprow) xb = x + 5 * ridx[i]; 2957 x1 = xb[0]; 2958 x2 = xb[1]; 2959 x3 = xb[2]; 2960 x4 = xb[3]; 2961 x5 = xb[4]; 2962 ib = idx + ii[0]; 2963 n = ii[1] - ii[0]; 2964 ii++; 2965 for (j = 0; j < n; j++) { 2966 rval = ib[j] * 5; 2967 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2968 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2969 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2970 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2971 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2972 v += 25; 2973 } 2974 if (!usecprow) xb += 5; 2975 } 2976 break; 2977 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2978 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2979 #if 0 2980 { 2981 PetscInt ncols,k,bs2=a->bs2; 2982 PetscScalar *work,*workt,zb; 2983 const PetscScalar *xtmp; 2984 if (!a->mult_work) { 2985 k = PetscMax(A->rmap->n,A->cmap->n); 2986 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2987 } 2988 work = a->mult_work; 2989 xtmp = x; 2990 for (i=0; i<mbs; i++) { 2991 n = ii[1] - ii[0]; ii++; 2992 ncols = n*bs; 2993 PetscCall(PetscArrayzero(work,ncols)); 2994 if (usecprow) xtmp = x + bs*ridx[i]; 2995 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2996 v += n*bs2; 2997 if (!usecprow) xtmp += bs; 2998 workt = work; 2999 for (j=0; j<n; j++) { 3000 zb = z + bs*(*idx++); 3001 for (k=0; k<bs; k++) zb[k] += workt[k] ; 3002 workt += bs; 3003 } 3004 } 3005 } 3006 #endif 3007 } 3008 PetscCall(VecRestoreArrayRead(xx, &x)); 3009 PetscCall(VecRestoreArray(zz, &z)); 3010 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3011 PetscFunctionReturn(PETSC_SUCCESS); 3012 } 3013 3014 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 3015 { 3016 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3017 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 3018 const PetscScalar *x, *xb = NULL; 3019 const MatScalar *v; 3020 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3021 const PetscInt *idx, *ii, *ib, *ridx = NULL; 3022 Mat_CompressedRow cprow = a->compressedrow; 3023 PetscBool usecprow = cprow.use; 3024 3025 PetscFunctionBegin; 3026 if (yy != zz) PetscCall(VecCopy(yy, zz)); 3027 PetscCall(VecGetArrayRead(xx, &x)); 3028 PetscCall(VecGetArray(zz, &z)); 3029 3030 idx = a->j; 3031 v = a->a; 3032 if (usecprow) { 3033 mbs = cprow.nrows; 3034 ii = cprow.i; 3035 ridx = cprow.rindex; 3036 } else { 3037 mbs = a->mbs; 3038 ii = a->i; 3039 xb = x; 3040 } 3041 3042 switch (bs) { 3043 case 1: 3044 for (i = 0; i < mbs; i++) { 3045 if (usecprow) xb = x + ridx[i]; 3046 x1 = xb[0]; 3047 ib = idx + ii[0]; 3048 n = ii[1] - ii[0]; 3049 ii++; 3050 for (j = 0; j < n; j++) { 3051 rval = ib[j]; 3052 z[rval] += *v * x1; 3053 v++; 3054 } 3055 if (!usecprow) xb++; 3056 } 3057 break; 3058 case 2: 3059 for (i = 0; i < mbs; i++) { 3060 if (usecprow) xb = x + 2 * ridx[i]; 3061 x1 = xb[0]; 3062 x2 = xb[1]; 3063 ib = idx + ii[0]; 3064 n = ii[1] - ii[0]; 3065 ii++; 3066 for (j = 0; j < n; j++) { 3067 rval = ib[j] * 2; 3068 z[rval++] += v[0] * x1 + v[1] * x2; 3069 z[rval++] += v[2] * x1 + v[3] * x2; 3070 v += 4; 3071 } 3072 if (!usecprow) xb += 2; 3073 } 3074 break; 3075 case 3: 3076 for (i = 0; i < mbs; i++) { 3077 if (usecprow) xb = x + 3 * ridx[i]; 3078 x1 = xb[0]; 3079 x2 = xb[1]; 3080 x3 = xb[2]; 3081 ib = idx + ii[0]; 3082 n = ii[1] - ii[0]; 3083 ii++; 3084 for (j = 0; j < n; j++) { 3085 rval = ib[j] * 3; 3086 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3087 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3088 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3089 v += 9; 3090 } 3091 if (!usecprow) xb += 3; 3092 } 3093 break; 3094 case 4: 3095 for (i = 0; i < mbs; i++) { 3096 if (usecprow) xb = x + 4 * ridx[i]; 3097 x1 = xb[0]; 3098 x2 = xb[1]; 3099 x3 = xb[2]; 3100 x4 = xb[3]; 3101 ib = idx + ii[0]; 3102 n = ii[1] - ii[0]; 3103 ii++; 3104 for (j = 0; j < n; j++) { 3105 rval = ib[j] * 4; 3106 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3107 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3108 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3109 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3110 v += 16; 3111 } 3112 if (!usecprow) xb += 4; 3113 } 3114 break; 3115 case 5: 3116 for (i = 0; i < mbs; i++) { 3117 if (usecprow) xb = x + 5 * ridx[i]; 3118 x1 = xb[0]; 3119 x2 = xb[1]; 3120 x3 = xb[2]; 3121 x4 = xb[3]; 3122 x5 = xb[4]; 3123 ib = idx + ii[0]; 3124 n = ii[1] - ii[0]; 3125 ii++; 3126 for (j = 0; j < n; j++) { 3127 rval = ib[j] * 5; 3128 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3129 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3130 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3131 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3132 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3133 v += 25; 3134 } 3135 if (!usecprow) xb += 5; 3136 } 3137 break; 3138 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3139 PetscInt ncols, k; 3140 PetscScalar *work, *workt; 3141 const PetscScalar *xtmp; 3142 if (!a->mult_work) { 3143 k = PetscMax(A->rmap->n, A->cmap->n); 3144 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3145 } 3146 work = a->mult_work; 3147 xtmp = x; 3148 for (i = 0; i < mbs; i++) { 3149 n = ii[1] - ii[0]; 3150 ii++; 3151 ncols = n * bs; 3152 PetscCall(PetscArrayzero(work, ncols)); 3153 if (usecprow) xtmp = x + bs * ridx[i]; 3154 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3155 v += n * bs2; 3156 if (!usecprow) xtmp += bs; 3157 workt = work; 3158 for (j = 0; j < n; j++) { 3159 zb = z + bs * (*idx++); 3160 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3161 workt += bs; 3162 } 3163 } 3164 } 3165 } 3166 PetscCall(VecRestoreArrayRead(xx, &x)); 3167 PetscCall(VecRestoreArray(zz, &z)); 3168 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3169 PetscFunctionReturn(PETSC_SUCCESS); 3170 } 3171 3172 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3173 { 3174 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3175 PetscInt totalnz = a->bs2 * a->nz; 3176 PetscScalar oalpha = alpha; 3177 PetscBLASInt one = 1, tnz; 3178 3179 PetscFunctionBegin; 3180 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3181 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3182 PetscCall(PetscLogFlops(totalnz)); 3183 PetscFunctionReturn(PETSC_SUCCESS); 3184 } 3185 3186 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3187 { 3188 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3189 MatScalar *v = a->a; 3190 PetscReal sum = 0.0; 3191 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3192 3193 PetscFunctionBegin; 3194 if (type == NORM_FROBENIUS) { 3195 #if defined(PETSC_USE_REAL___FP16) 3196 PetscBLASInt one = 1, cnt = bs2 * nz; 3197 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3198 #else 3199 for (i = 0; i < bs2 * nz; i++) { 3200 sum += PetscRealPart(PetscConj(*v) * (*v)); 3201 v++; 3202 } 3203 #endif 3204 *norm = PetscSqrtReal(sum); 3205 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3206 } else if (type == NORM_1) { /* maximum column sum */ 3207 PetscReal *tmp; 3208 PetscInt *bcol = a->j; 3209 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3210 for (i = 0; i < nz; i++) { 3211 for (j = 0; j < bs; j++) { 3212 k1 = bs * (*bcol) + j; /* column index */ 3213 for (k = 0; k < bs; k++) { 3214 tmp[k1] += PetscAbsScalar(*v); 3215 v++; 3216 } 3217 } 3218 bcol++; 3219 } 3220 *norm = 0.0; 3221 for (j = 0; j < A->cmap->n; j++) { 3222 if (tmp[j] > *norm) *norm = tmp[j]; 3223 } 3224 PetscCall(PetscFree(tmp)); 3225 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3226 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3227 *norm = 0.0; 3228 for (k = 0; k < bs; k++) { 3229 for (j = 0; j < a->mbs; j++) { 3230 v = a->a + bs2 * a->i[j] + k; 3231 sum = 0.0; 3232 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3233 for (k1 = 0; k1 < bs; k1++) { 3234 sum += PetscAbsScalar(*v); 3235 v += bs; 3236 } 3237 } 3238 if (sum > *norm) *norm = sum; 3239 } 3240 } 3241 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3242 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3243 PetscFunctionReturn(PETSC_SUCCESS); 3244 } 3245 3246 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3247 { 3248 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3249 3250 PetscFunctionBegin; 3251 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3252 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3253 *flg = PETSC_FALSE; 3254 PetscFunctionReturn(PETSC_SUCCESS); 3255 } 3256 3257 /* if the a->i are the same */ 3258 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3259 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3260 3261 /* if a->j are the same */ 3262 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3263 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3264 3265 /* if a->a are the same */ 3266 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3267 PetscFunctionReturn(PETSC_SUCCESS); 3268 } 3269 3270 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3271 { 3272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3273 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3274 PetscScalar *x, zero = 0.0; 3275 MatScalar *aa, *aa_j; 3276 3277 PetscFunctionBegin; 3278 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3279 bs = A->rmap->bs; 3280 aa = a->a; 3281 ai = a->i; 3282 aj = a->j; 3283 ambs = a->mbs; 3284 bs2 = a->bs2; 3285 3286 PetscCall(VecSet(v, zero)); 3287 PetscCall(VecGetArray(v, &x)); 3288 PetscCall(VecGetLocalSize(v, &n)); 3289 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3290 for (i = 0; i < ambs; i++) { 3291 for (j = ai[i]; j < ai[i + 1]; j++) { 3292 if (aj[j] == i) { 3293 row = i * bs; 3294 aa_j = aa + j * bs2; 3295 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3296 break; 3297 } 3298 } 3299 } 3300 PetscCall(VecRestoreArray(v, &x)); 3301 PetscFunctionReturn(PETSC_SUCCESS); 3302 } 3303 3304 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3305 { 3306 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3307 const PetscScalar *l, *r, *li, *ri; 3308 PetscScalar x; 3309 MatScalar *aa, *v; 3310 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3311 const PetscInt *ai, *aj; 3312 3313 PetscFunctionBegin; 3314 ai = a->i; 3315 aj = a->j; 3316 aa = a->a; 3317 m = A->rmap->n; 3318 n = A->cmap->n; 3319 bs = A->rmap->bs; 3320 mbs = a->mbs; 3321 bs2 = a->bs2; 3322 if (ll) { 3323 PetscCall(VecGetArrayRead(ll, &l)); 3324 PetscCall(VecGetLocalSize(ll, &lm)); 3325 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3326 for (i = 0; i < mbs; i++) { /* for each block row */ 3327 M = ai[i + 1] - ai[i]; 3328 li = l + i * bs; 3329 v = aa + bs2 * ai[i]; 3330 for (j = 0; j < M; j++) { /* for each block */ 3331 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 3332 } 3333 } 3334 PetscCall(VecRestoreArrayRead(ll, &l)); 3335 PetscCall(PetscLogFlops(a->nz)); 3336 } 3337 3338 if (rr) { 3339 PetscCall(VecGetArrayRead(rr, &r)); 3340 PetscCall(VecGetLocalSize(rr, &rn)); 3341 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3342 for (i = 0; i < mbs; i++) { /* for each block row */ 3343 iai = ai[i]; 3344 M = ai[i + 1] - iai; 3345 v = aa + bs2 * iai; 3346 for (j = 0; j < M; j++) { /* for each block */ 3347 ri = r + bs * aj[iai + j]; 3348 for (k = 0; k < bs; k++) { 3349 x = ri[k]; 3350 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3351 v += bs; 3352 } 3353 } 3354 } 3355 PetscCall(VecRestoreArrayRead(rr, &r)); 3356 PetscCall(PetscLogFlops(a->nz)); 3357 } 3358 PetscFunctionReturn(PETSC_SUCCESS); 3359 } 3360 3361 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3362 { 3363 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3364 3365 PetscFunctionBegin; 3366 info->block_size = a->bs2; 3367 info->nz_allocated = a->bs2 * a->maxnz; 3368 info->nz_used = a->bs2 * a->nz; 3369 info->nz_unneeded = info->nz_allocated - info->nz_used; 3370 info->assemblies = A->num_ass; 3371 info->mallocs = A->info.mallocs; 3372 info->memory = 0; /* REVIEW ME */ 3373 if (A->factortype) { 3374 info->fill_ratio_given = A->info.fill_ratio_given; 3375 info->fill_ratio_needed = A->info.fill_ratio_needed; 3376 info->factor_mallocs = A->info.factor_mallocs; 3377 } else { 3378 info->fill_ratio_given = 0; 3379 info->fill_ratio_needed = 0; 3380 info->factor_mallocs = 0; 3381 } 3382 PetscFunctionReturn(PETSC_SUCCESS); 3383 } 3384 3385 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3386 { 3387 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3388 3389 PetscFunctionBegin; 3390 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3391 PetscFunctionReturn(PETSC_SUCCESS); 3392 } 3393 3394 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3395 { 3396 PetscFunctionBegin; 3397 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3398 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3399 PetscFunctionReturn(PETSC_SUCCESS); 3400 } 3401 3402 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3403 { 3404 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3405 PetscScalar *z = NULL, sum1; 3406 const PetscScalar *xb; 3407 PetscScalar x1; 3408 const MatScalar *v, *vv; 3409 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3410 PetscBool usecprow = a->compressedrow.use; 3411 3412 PetscFunctionBegin; 3413 idx = a->j; 3414 v = a->a; 3415 if (usecprow) { 3416 mbs = a->compressedrow.nrows; 3417 ii = a->compressedrow.i; 3418 ridx = a->compressedrow.rindex; 3419 } else { 3420 mbs = a->mbs; 3421 ii = a->i; 3422 z = c; 3423 } 3424 3425 for (i = 0; i < mbs; i++) { 3426 n = ii[1] - ii[0]; 3427 ii++; 3428 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3429 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3430 if (usecprow) z = c + ridx[i]; 3431 jj = idx; 3432 vv = v; 3433 for (k = 0; k < cn; k++) { 3434 idx = jj; 3435 v = vv; 3436 sum1 = 0.0; 3437 for (j = 0; j < n; j++) { 3438 xb = b + (*idx++); 3439 x1 = xb[0 + k * bm]; 3440 sum1 += v[0] * x1; 3441 v += 1; 3442 } 3443 z[0 + k * cm] = sum1; 3444 } 3445 if (!usecprow) z += 1; 3446 } 3447 PetscFunctionReturn(PETSC_SUCCESS); 3448 } 3449 3450 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3451 { 3452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3453 PetscScalar *z = NULL, sum1, sum2; 3454 const PetscScalar *xb; 3455 PetscScalar x1, x2; 3456 const MatScalar *v, *vv; 3457 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3458 PetscBool usecprow = a->compressedrow.use; 3459 3460 PetscFunctionBegin; 3461 idx = a->j; 3462 v = a->a; 3463 if (usecprow) { 3464 mbs = a->compressedrow.nrows; 3465 ii = a->compressedrow.i; 3466 ridx = a->compressedrow.rindex; 3467 } else { 3468 mbs = a->mbs; 3469 ii = a->i; 3470 z = c; 3471 } 3472 3473 for (i = 0; i < mbs; i++) { 3474 n = ii[1] - ii[0]; 3475 ii++; 3476 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3477 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3478 if (usecprow) z = c + 2 * ridx[i]; 3479 jj = idx; 3480 vv = v; 3481 for (k = 0; k < cn; k++) { 3482 idx = jj; 3483 v = vv; 3484 sum1 = 0.0; 3485 sum2 = 0.0; 3486 for (j = 0; j < n; j++) { 3487 xb = b + 2 * (*idx++); 3488 x1 = xb[0 + k * bm]; 3489 x2 = xb[1 + k * bm]; 3490 sum1 += v[0] * x1 + v[2] * x2; 3491 sum2 += v[1] * x1 + v[3] * x2; 3492 v += 4; 3493 } 3494 z[0 + k * cm] = sum1; 3495 z[1 + k * cm] = sum2; 3496 } 3497 if (!usecprow) z += 2; 3498 } 3499 PetscFunctionReturn(PETSC_SUCCESS); 3500 } 3501 3502 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3503 { 3504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3505 PetscScalar *z = NULL, sum1, sum2, sum3; 3506 const PetscScalar *xb; 3507 PetscScalar x1, x2, x3; 3508 const MatScalar *v, *vv; 3509 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3510 PetscBool usecprow = a->compressedrow.use; 3511 3512 PetscFunctionBegin; 3513 idx = a->j; 3514 v = a->a; 3515 if (usecprow) { 3516 mbs = a->compressedrow.nrows; 3517 ii = a->compressedrow.i; 3518 ridx = a->compressedrow.rindex; 3519 } else { 3520 mbs = a->mbs; 3521 ii = a->i; 3522 z = c; 3523 } 3524 3525 for (i = 0; i < mbs; i++) { 3526 n = ii[1] - ii[0]; 3527 ii++; 3528 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3529 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3530 if (usecprow) z = c + 3 * ridx[i]; 3531 jj = idx; 3532 vv = v; 3533 for (k = 0; k < cn; k++) { 3534 idx = jj; 3535 v = vv; 3536 sum1 = 0.0; 3537 sum2 = 0.0; 3538 sum3 = 0.0; 3539 for (j = 0; j < n; j++) { 3540 xb = b + 3 * (*idx++); 3541 x1 = xb[0 + k * bm]; 3542 x2 = xb[1 + k * bm]; 3543 x3 = xb[2 + k * bm]; 3544 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3545 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3546 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3547 v += 9; 3548 } 3549 z[0 + k * cm] = sum1; 3550 z[1 + k * cm] = sum2; 3551 z[2 + k * cm] = sum3; 3552 } 3553 if (!usecprow) z += 3; 3554 } 3555 PetscFunctionReturn(PETSC_SUCCESS); 3556 } 3557 3558 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3559 { 3560 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3561 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3562 const PetscScalar *xb; 3563 PetscScalar x1, x2, x3, x4; 3564 const MatScalar *v, *vv; 3565 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3566 PetscBool usecprow = a->compressedrow.use; 3567 3568 PetscFunctionBegin; 3569 idx = a->j; 3570 v = a->a; 3571 if (usecprow) { 3572 mbs = a->compressedrow.nrows; 3573 ii = a->compressedrow.i; 3574 ridx = a->compressedrow.rindex; 3575 } else { 3576 mbs = a->mbs; 3577 ii = a->i; 3578 z = c; 3579 } 3580 3581 for (i = 0; i < mbs; i++) { 3582 n = ii[1] - ii[0]; 3583 ii++; 3584 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3585 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3586 if (usecprow) z = c + 4 * ridx[i]; 3587 jj = idx; 3588 vv = v; 3589 for (k = 0; k < cn; k++) { 3590 idx = jj; 3591 v = vv; 3592 sum1 = 0.0; 3593 sum2 = 0.0; 3594 sum3 = 0.0; 3595 sum4 = 0.0; 3596 for (j = 0; j < n; j++) { 3597 xb = b + 4 * (*idx++); 3598 x1 = xb[0 + k * bm]; 3599 x2 = xb[1 + k * bm]; 3600 x3 = xb[2 + k * bm]; 3601 x4 = xb[3 + k * bm]; 3602 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3603 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3604 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3605 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3606 v += 16; 3607 } 3608 z[0 + k * cm] = sum1; 3609 z[1 + k * cm] = sum2; 3610 z[2 + k * cm] = sum3; 3611 z[3 + k * cm] = sum4; 3612 } 3613 if (!usecprow) z += 4; 3614 } 3615 PetscFunctionReturn(PETSC_SUCCESS); 3616 } 3617 3618 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3619 { 3620 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3621 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3622 const PetscScalar *xb; 3623 PetscScalar x1, x2, x3, x4, x5; 3624 const MatScalar *v, *vv; 3625 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3626 PetscBool usecprow = a->compressedrow.use; 3627 3628 PetscFunctionBegin; 3629 idx = a->j; 3630 v = a->a; 3631 if (usecprow) { 3632 mbs = a->compressedrow.nrows; 3633 ii = a->compressedrow.i; 3634 ridx = a->compressedrow.rindex; 3635 } else { 3636 mbs = a->mbs; 3637 ii = a->i; 3638 z = c; 3639 } 3640 3641 for (i = 0; i < mbs; i++) { 3642 n = ii[1] - ii[0]; 3643 ii++; 3644 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3645 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3646 if (usecprow) z = c + 5 * ridx[i]; 3647 jj = idx; 3648 vv = v; 3649 for (k = 0; k < cn; k++) { 3650 idx = jj; 3651 v = vv; 3652 sum1 = 0.0; 3653 sum2 = 0.0; 3654 sum3 = 0.0; 3655 sum4 = 0.0; 3656 sum5 = 0.0; 3657 for (j = 0; j < n; j++) { 3658 xb = b + 5 * (*idx++); 3659 x1 = xb[0 + k * bm]; 3660 x2 = xb[1 + k * bm]; 3661 x3 = xb[2 + k * bm]; 3662 x4 = xb[3 + k * bm]; 3663 x5 = xb[4 + k * bm]; 3664 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3665 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3666 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3667 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3668 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3669 v += 25; 3670 } 3671 z[0 + k * cm] = sum1; 3672 z[1 + k * cm] = sum2; 3673 z[2 + k * cm] = sum3; 3674 z[3 + k * cm] = sum4; 3675 z[4 + k * cm] = sum5; 3676 } 3677 if (!usecprow) z += 5; 3678 } 3679 PetscFunctionReturn(PETSC_SUCCESS); 3680 } 3681 3682 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3683 { 3684 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3685 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3686 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3687 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3688 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3689 PetscBLASInt bbs, bcn, bbm, bcm; 3690 PetscScalar *z = NULL; 3691 PetscScalar *c, *b; 3692 const MatScalar *v; 3693 const PetscInt *idx, *ii, *ridx = NULL; 3694 PetscScalar _DZero = 0.0, _DOne = 1.0; 3695 PetscBool usecprow = a->compressedrow.use; 3696 3697 PetscFunctionBegin; 3698 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 3699 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3700 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3701 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3702 b = bd->v; 3703 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 3704 PetscCall(MatDenseGetArray(C, &c)); 3705 switch (bs) { 3706 case 1: 3707 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3708 break; 3709 case 2: 3710 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3711 break; 3712 case 3: 3713 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3714 break; 3715 case 4: 3716 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3717 break; 3718 case 5: 3719 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3720 break; 3721 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3722 PetscCall(PetscBLASIntCast(bs, &bbs)); 3723 PetscCall(PetscBLASIntCast(cn, &bcn)); 3724 PetscCall(PetscBLASIntCast(bm, &bbm)); 3725 PetscCall(PetscBLASIntCast(cm, &bcm)); 3726 idx = a->j; 3727 v = a->a; 3728 if (usecprow) { 3729 mbs = a->compressedrow.nrows; 3730 ii = a->compressedrow.i; 3731 ridx = a->compressedrow.rindex; 3732 } else { 3733 mbs = a->mbs; 3734 ii = a->i; 3735 z = c; 3736 } 3737 for (i = 0; i < mbs; i++) { 3738 n = ii[1] - ii[0]; 3739 ii++; 3740 if (usecprow) z = c + bs * ridx[i]; 3741 if (n) { 3742 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3743 v += bs2; 3744 } 3745 for (j = 1; j < n; j++) { 3746 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3747 v += bs2; 3748 } 3749 if (!usecprow) z += bs; 3750 } 3751 } 3752 PetscCall(MatDenseRestoreArray(C, &c)); 3753 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3754 PetscFunctionReturn(PETSC_SUCCESS); 3755 } 3756