1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 15 const PetscInt *idx; 16 PetscInt start, end, *ai, *aj, bs; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m, &table)); 28 PetscCall(PetscMalloc1(m + 1, &nidx)); 29 30 for (i = 0; i < is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 PetscCall(PetscBTMemzero(m, table)); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 PetscCall(ISGetIndices(is[i], &idx)); 37 PetscCall(ISGetLocalSize(is[i], &n)); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j = 0; j < n; ++j) { 41 ival = idx[j] / bs; /* convert the indices into block indices */ 42 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 43 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44 } 45 PetscCall(ISRestoreIndices(is[i], &idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j = 0; j < ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row + 1]; 55 for (l = start; l < end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58 } 59 } 60 } 61 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 62 } 63 PetscCall(PetscBTDestroy(&table)); 64 PetscCall(PetscFree(nidx)); 65 PetscFunctionReturn(PETSC_SUCCESS); 66 } 67 68 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 69 { 70 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 71 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 72 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 73 const PetscInt *irow, *icol; 74 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 75 PetscInt *aj = a->j, *ai = a->i; 76 MatScalar *mat_a; 77 Mat C; 78 PetscBool flag; 79 80 PetscFunctionBegin; 81 PetscCall(ISGetIndices(isrow, &irow)); 82 PetscCall(ISGetIndices(iscol, &icol)); 83 PetscCall(ISGetLocalSize(isrow, &nrows)); 84 PetscCall(ISGetLocalSize(iscol, &ncols)); 85 86 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 87 ssmap = smap; 88 PetscCall(PetscMalloc1(1 + nrows, &lens)); 89 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 90 /* determine lens of each row */ 91 for (i = 0; i < nrows; i++) { 92 kstart = ai[irow[i]]; 93 kend = kstart + a->ilen[irow[i]]; 94 lens[i] = 0; 95 for (k = kstart; k < kend; k++) { 96 if (ssmap[aj[k]]) lens[i]++; 97 } 98 } 99 /* Create and fill new matrix */ 100 if (scall == MAT_REUSE_MATRIX) { 101 c = (Mat_SeqBAIJ *)((*B)->data); 102 103 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 104 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 105 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 106 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 107 C = *B; 108 } else { 109 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 110 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 111 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 112 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 113 } 114 c = (Mat_SeqBAIJ *)(C->data); 115 for (i = 0; i < nrows; i++) { 116 row = irow[i]; 117 kstart = ai[row]; 118 kend = kstart + a->ilen[row]; 119 mat_i = c->i[i]; 120 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 121 mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 122 mat_ilen = c->ilen + i; 123 for (k = kstart; k < kend; k++) { 124 if ((tcol = ssmap[a->j[k]])) { 125 *mat_j++ = tcol - 1; 126 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 127 mat_a += bs2; 128 (*mat_ilen)++; 129 } 130 } 131 } 132 /* sort */ 133 if (c->j && c->a) { 134 MatScalar *work; 135 PetscCall(PetscMalloc1(bs2, &work)); 136 for (i = 0; i < nrows; i++) { 137 PetscInt ilen; 138 mat_i = c->i[i]; 139 mat_j = c->j + mat_i; 140 mat_a = c->a + mat_i * bs2; 141 ilen = c->ilen[i]; 142 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 143 } 144 PetscCall(PetscFree(work)); 145 } 146 147 /* Free work space */ 148 PetscCall(ISRestoreIndices(iscol, &icol)); 149 PetscCall(PetscFree(smap)); 150 PetscCall(PetscFree(lens)); 151 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 152 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 153 154 PetscCall(ISRestoreIndices(isrow, &irow)); 155 *B = C; 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 160 { 161 IS is1, is2; 162 163 PetscFunctionBegin; 164 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 165 if (isrow == iscol) { 166 is2 = is1; 167 PetscCall(PetscObjectReference((PetscObject)is2)); 168 } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 169 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 170 PetscCall(ISDestroy(&is1)); 171 PetscCall(ISDestroy(&is2)); 172 PetscFunctionReturn(PETSC_SUCCESS); 173 } 174 175 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 176 { 177 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 178 Mat_SubSppt *submatj = c->submatis1; 179 180 PetscFunctionBegin; 181 PetscCall((*submatj->destroy)(C)); 182 PetscCall(MatDestroySubMatrix_Private(submatj)); 183 PetscFunctionReturn(PETSC_SUCCESS); 184 } 185 186 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 187 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 188 { 189 PetscInt i; 190 Mat C; 191 Mat_SeqBAIJ *c; 192 Mat_SubSppt *submatj; 193 194 PetscFunctionBegin; 195 for (i = 0; i < n; i++) { 196 C = (*mat)[i]; 197 c = (Mat_SeqBAIJ *)C->data; 198 submatj = c->submatis1; 199 if (submatj) { 200 if (--((PetscObject)C)->refct <= 0) { 201 PetscCall(PetscFree(C->factorprefix)); 202 PetscCall((*submatj->destroy)(C)); 203 PetscCall(MatDestroySubMatrix_Private(submatj)); 204 PetscCall(PetscFree(C->defaultvectype)); 205 PetscCall(PetscFree(C->defaultrandtype)); 206 PetscCall(PetscLayoutDestroy(&C->rmap)); 207 PetscCall(PetscLayoutDestroy(&C->cmap)); 208 PetscCall(PetscHeaderDestroy(&C)); 209 } 210 } else { 211 PetscCall(MatDestroy(&C)); 212 } 213 } 214 215 /* Destroy Dummy submatrices created for reuse */ 216 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 217 218 PetscCall(PetscFree(*mat)); 219 PetscFunctionReturn(PETSC_SUCCESS); 220 } 221 222 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 223 { 224 PetscInt i; 225 226 PetscFunctionBegin; 227 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 228 229 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 /* Should check that shapes of vectors and matrices match */ 234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 235 { 236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 237 PetscScalar *z, sum; 238 const PetscScalar *x; 239 const MatScalar *v; 240 PetscInt mbs, i, n; 241 const PetscInt *idx, *ii, *ridx = NULL; 242 PetscBool usecprow = a->compressedrow.use; 243 244 PetscFunctionBegin; 245 PetscCall(VecGetArrayRead(xx, &x)); 246 PetscCall(VecGetArrayWrite(zz, &z)); 247 248 if (usecprow) { 249 mbs = a->compressedrow.nrows; 250 ii = a->compressedrow.i; 251 ridx = a->compressedrow.rindex; 252 PetscCall(PetscArrayzero(z, a->mbs)); 253 } else { 254 mbs = a->mbs; 255 ii = a->i; 256 } 257 258 for (i = 0; i < mbs; i++) { 259 n = ii[1] - ii[0]; 260 v = a->a + ii[0]; 261 idx = a->j + ii[0]; 262 ii++; 263 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 264 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 265 sum = 0.0; 266 PetscSparseDensePlusDot(sum, x, v, idx, n); 267 if (usecprow) { 268 z[ridx[i]] = sum; 269 } else { 270 z[i] = sum; 271 } 272 } 273 PetscCall(VecRestoreArrayRead(xx, &x)); 274 PetscCall(VecRestoreArrayWrite(zz, &z)); 275 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 276 PetscFunctionReturn(PETSC_SUCCESS); 277 } 278 279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 282 PetscScalar *z = NULL, sum1, sum2, *zarray; 283 const PetscScalar *x, *xb; 284 PetscScalar x1, x2; 285 const MatScalar *v; 286 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 287 PetscBool usecprow = a->compressedrow.use; 288 289 PetscFunctionBegin; 290 PetscCall(VecGetArrayRead(xx, &x)); 291 PetscCall(VecGetArrayWrite(zz, &zarray)); 292 293 idx = a->j; 294 v = a->a; 295 if (usecprow) { 296 mbs = a->compressedrow.nrows; 297 ii = a->compressedrow.i; 298 ridx = a->compressedrow.rindex; 299 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 300 } else { 301 mbs = a->mbs; 302 ii = a->i; 303 z = zarray; 304 } 305 306 for (i = 0; i < mbs; i++) { 307 n = ii[1] - ii[0]; 308 ii++; 309 sum1 = 0.0; 310 sum2 = 0.0; 311 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 312 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 313 for (j = 0; j < n; j++) { 314 xb = x + 2 * (*idx++); 315 x1 = xb[0]; 316 x2 = xb[1]; 317 sum1 += v[0] * x1 + v[2] * x2; 318 sum2 += v[1] * x1 + v[3] * x2; 319 v += 4; 320 } 321 if (usecprow) z = zarray + 2 * ridx[i]; 322 z[0] = sum1; 323 z[1] = sum2; 324 if (!usecprow) z += 2; 325 } 326 PetscCall(VecRestoreArrayRead(xx, &x)); 327 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 328 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 329 PetscFunctionReturn(PETSC_SUCCESS); 330 } 331 332 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 333 { 334 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 335 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 336 const PetscScalar *x, *xb; 337 const MatScalar *v; 338 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 339 PetscBool usecprow = a->compressedrow.use; 340 341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 342 #pragma disjoint(*v, *z, *xb) 343 #endif 344 345 PetscFunctionBegin; 346 PetscCall(VecGetArrayRead(xx, &x)); 347 PetscCall(VecGetArrayWrite(zz, &zarray)); 348 349 idx = a->j; 350 v = a->a; 351 if (usecprow) { 352 mbs = a->compressedrow.nrows; 353 ii = a->compressedrow.i; 354 ridx = a->compressedrow.rindex; 355 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 356 } else { 357 mbs = a->mbs; 358 ii = a->i; 359 z = zarray; 360 } 361 362 for (i = 0; i < mbs; i++) { 363 n = ii[1] - ii[0]; 364 ii++; 365 sum1 = 0.0; 366 sum2 = 0.0; 367 sum3 = 0.0; 368 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 369 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 370 for (j = 0; j < n; j++) { 371 xb = x + 3 * (*idx++); 372 x1 = xb[0]; 373 x2 = xb[1]; 374 x3 = xb[2]; 375 376 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 377 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 378 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 379 v += 9; 380 } 381 if (usecprow) z = zarray + 3 * ridx[i]; 382 z[0] = sum1; 383 z[1] = sum2; 384 z[2] = sum3; 385 if (!usecprow) z += 3; 386 } 387 PetscCall(VecRestoreArrayRead(xx, &x)); 388 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 389 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392 393 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 394 { 395 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 396 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 397 const PetscScalar *x, *xb; 398 const MatScalar *v; 399 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 400 PetscBool usecprow = a->compressedrow.use; 401 402 PetscFunctionBegin; 403 PetscCall(VecGetArrayRead(xx, &x)); 404 PetscCall(VecGetArrayWrite(zz, &zarray)); 405 406 idx = a->j; 407 v = a->a; 408 if (usecprow) { 409 mbs = a->compressedrow.nrows; 410 ii = a->compressedrow.i; 411 ridx = a->compressedrow.rindex; 412 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 413 } else { 414 mbs = a->mbs; 415 ii = a->i; 416 z = zarray; 417 } 418 419 for (i = 0; i < mbs; i++) { 420 n = ii[1] - ii[0]; 421 ii++; 422 sum1 = 0.0; 423 sum2 = 0.0; 424 sum3 = 0.0; 425 sum4 = 0.0; 426 427 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 428 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 429 for (j = 0; j < n; j++) { 430 xb = x + 4 * (*idx++); 431 x1 = xb[0]; 432 x2 = xb[1]; 433 x3 = xb[2]; 434 x4 = xb[3]; 435 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 436 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 437 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 438 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 439 v += 16; 440 } 441 if (usecprow) z = zarray + 4 * ridx[i]; 442 z[0] = sum1; 443 z[1] = sum2; 444 z[2] = sum3; 445 z[3] = sum4; 446 if (!usecprow) z += 4; 447 } 448 PetscCall(VecRestoreArrayRead(xx, &x)); 449 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 450 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 451 PetscFunctionReturn(PETSC_SUCCESS); 452 } 453 454 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 455 { 456 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 457 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 458 const PetscScalar *xb, *x; 459 const MatScalar *v; 460 const PetscInt *idx, *ii, *ridx = NULL; 461 PetscInt mbs, i, j, n; 462 PetscBool usecprow = a->compressedrow.use; 463 464 PetscFunctionBegin; 465 PetscCall(VecGetArrayRead(xx, &x)); 466 PetscCall(VecGetArrayWrite(zz, &zarray)); 467 468 idx = a->j; 469 v = a->a; 470 if (usecprow) { 471 mbs = a->compressedrow.nrows; 472 ii = a->compressedrow.i; 473 ridx = a->compressedrow.rindex; 474 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 475 } else { 476 mbs = a->mbs; 477 ii = a->i; 478 z = zarray; 479 } 480 481 for (i = 0; i < mbs; i++) { 482 n = ii[1] - ii[0]; 483 ii++; 484 sum1 = 0.0; 485 sum2 = 0.0; 486 sum3 = 0.0; 487 sum4 = 0.0; 488 sum5 = 0.0; 489 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 490 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 491 for (j = 0; j < n; j++) { 492 xb = x + 5 * (*idx++); 493 x1 = xb[0]; 494 x2 = xb[1]; 495 x3 = xb[2]; 496 x4 = xb[3]; 497 x5 = xb[4]; 498 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 499 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 500 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 501 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 502 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 503 v += 25; 504 } 505 if (usecprow) z = zarray + 5 * ridx[i]; 506 z[0] = sum1; 507 z[1] = sum2; 508 z[2] = sum3; 509 z[3] = sum4; 510 z[4] = sum5; 511 if (!usecprow) z += 5; 512 } 513 PetscCall(VecRestoreArrayRead(xx, &x)); 514 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 515 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 516 PetscFunctionReturn(PETSC_SUCCESS); 517 } 518 519 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 520 { 521 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 522 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 523 const PetscScalar *x, *xb; 524 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 525 const MatScalar *v; 526 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 527 PetscBool usecprow = a->compressedrow.use; 528 529 PetscFunctionBegin; 530 PetscCall(VecGetArrayRead(xx, &x)); 531 PetscCall(VecGetArrayWrite(zz, &zarray)); 532 533 idx = a->j; 534 v = a->a; 535 if (usecprow) { 536 mbs = a->compressedrow.nrows; 537 ii = a->compressedrow.i; 538 ridx = a->compressedrow.rindex; 539 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 540 } else { 541 mbs = a->mbs; 542 ii = a->i; 543 z = zarray; 544 } 545 546 for (i = 0; i < mbs; i++) { 547 n = ii[1] - ii[0]; 548 ii++; 549 sum1 = 0.0; 550 sum2 = 0.0; 551 sum3 = 0.0; 552 sum4 = 0.0; 553 sum5 = 0.0; 554 sum6 = 0.0; 555 556 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 557 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 558 for (j = 0; j < n; j++) { 559 xb = x + 6 * (*idx++); 560 x1 = xb[0]; 561 x2 = xb[1]; 562 x3 = xb[2]; 563 x4 = xb[3]; 564 x5 = xb[4]; 565 x6 = xb[5]; 566 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 567 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 568 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 569 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 570 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 571 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 572 v += 36; 573 } 574 if (usecprow) z = zarray + 6 * ridx[i]; 575 z[0] = sum1; 576 z[1] = sum2; 577 z[2] = sum3; 578 z[3] = sum4; 579 z[4] = sum5; 580 z[5] = sum6; 581 if (!usecprow) z += 6; 582 } 583 584 PetscCall(VecRestoreArrayRead(xx, &x)); 585 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 586 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 587 PetscFunctionReturn(PETSC_SUCCESS); 588 } 589 590 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 591 { 592 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 593 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 594 const PetscScalar *x, *xb; 595 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 596 const MatScalar *v; 597 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 598 PetscBool usecprow = a->compressedrow.use; 599 600 PetscFunctionBegin; 601 PetscCall(VecGetArrayRead(xx, &x)); 602 PetscCall(VecGetArrayWrite(zz, &zarray)); 603 604 idx = a->j; 605 v = a->a; 606 if (usecprow) { 607 mbs = a->compressedrow.nrows; 608 ii = a->compressedrow.i; 609 ridx = a->compressedrow.rindex; 610 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 611 } else { 612 mbs = a->mbs; 613 ii = a->i; 614 z = zarray; 615 } 616 617 for (i = 0; i < mbs; i++) { 618 n = ii[1] - ii[0]; 619 ii++; 620 sum1 = 0.0; 621 sum2 = 0.0; 622 sum3 = 0.0; 623 sum4 = 0.0; 624 sum5 = 0.0; 625 sum6 = 0.0; 626 sum7 = 0.0; 627 628 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 629 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 630 for (j = 0; j < n; j++) { 631 xb = x + 7 * (*idx++); 632 x1 = xb[0]; 633 x2 = xb[1]; 634 x3 = xb[2]; 635 x4 = xb[3]; 636 x5 = xb[4]; 637 x6 = xb[5]; 638 x7 = xb[6]; 639 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 640 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 641 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 642 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 643 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 644 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 645 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 646 v += 49; 647 } 648 if (usecprow) z = zarray + 7 * ridx[i]; 649 z[0] = sum1; 650 z[1] = sum2; 651 z[2] = sum3; 652 z[3] = sum4; 653 z[4] = sum5; 654 z[5] = sum6; 655 z[6] = sum7; 656 if (!usecprow) z += 7; 657 } 658 659 PetscCall(VecRestoreArrayRead(xx, &x)); 660 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 661 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 662 PetscFunctionReturn(PETSC_SUCCESS); 663 } 664 665 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 666 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 667 { 668 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 669 PetscScalar *z = NULL, *work, *workt, *zarray; 670 const PetscScalar *x, *xb; 671 const MatScalar *v; 672 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 673 const PetscInt *idx, *ii, *ridx = NULL; 674 PetscInt k; 675 PetscBool usecprow = a->compressedrow.use; 676 677 __m256d a0, a1, a2, a3, a4, a5; 678 __m256d w0, w1, w2, w3; 679 __m256d z0, z1, z2; 680 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 681 682 PetscFunctionBegin; 683 PetscCall(VecGetArrayRead(xx, &x)); 684 PetscCall(VecGetArrayWrite(zz, &zarray)); 685 686 idx = a->j; 687 v = a->a; 688 if (usecprow) { 689 mbs = a->compressedrow.nrows; 690 ii = a->compressedrow.i; 691 ridx = a->compressedrow.rindex; 692 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 693 } else { 694 mbs = a->mbs; 695 ii = a->i; 696 z = zarray; 697 } 698 699 if (!a->mult_work) { 700 k = PetscMax(A->rmap->n, A->cmap->n); 701 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 702 } 703 704 work = a->mult_work; 705 for (i = 0; i < mbs; i++) { 706 n = ii[1] - ii[0]; 707 ii++; 708 workt = work; 709 for (j = 0; j < n; j++) { 710 xb = x + bs * (*idx++); 711 for (k = 0; k < bs; k++) workt[k] = xb[k]; 712 workt += bs; 713 } 714 if (usecprow) z = zarray + bs * ridx[i]; 715 716 z0 = _mm256_setzero_pd(); 717 z1 = _mm256_setzero_pd(); 718 z2 = _mm256_setzero_pd(); 719 720 for (j = 0; j < n; j++) { 721 /* first column of a */ 722 w0 = _mm256_set1_pd(work[j * 9]); 723 a0 = _mm256_loadu_pd(&v[j * 81]); 724 z0 = _mm256_fmadd_pd(a0, w0, z0); 725 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 726 z1 = _mm256_fmadd_pd(a1, w0, z1); 727 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 728 z2 = _mm256_fmadd_pd(a2, w0, z2); 729 730 /* second column of a */ 731 w1 = _mm256_set1_pd(work[j * 9 + 1]); 732 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 733 z0 = _mm256_fmadd_pd(a0, w1, z0); 734 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 735 z1 = _mm256_fmadd_pd(a1, w1, z1); 736 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 737 z2 = _mm256_fmadd_pd(a2, w1, z2); 738 739 /* third column of a */ 740 w2 = _mm256_set1_pd(work[j * 9 + 2]); 741 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 742 z0 = _mm256_fmadd_pd(a3, w2, z0); 743 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 744 z1 = _mm256_fmadd_pd(a4, w2, z1); 745 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 746 z2 = _mm256_fmadd_pd(a5, w2, z2); 747 748 /* fourth column of a */ 749 w3 = _mm256_set1_pd(work[j * 9 + 3]); 750 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 751 z0 = _mm256_fmadd_pd(a0, w3, z0); 752 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 753 z1 = _mm256_fmadd_pd(a1, w3, z1); 754 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 755 z2 = _mm256_fmadd_pd(a2, w3, z2); 756 757 /* fifth column of a */ 758 w0 = _mm256_set1_pd(work[j * 9 + 4]); 759 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 760 z0 = _mm256_fmadd_pd(a3, w0, z0); 761 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 762 z1 = _mm256_fmadd_pd(a4, w0, z1); 763 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 764 z2 = _mm256_fmadd_pd(a5, w0, z2); 765 766 /* sixth column of a */ 767 w1 = _mm256_set1_pd(work[j * 9 + 5]); 768 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 769 z0 = _mm256_fmadd_pd(a0, w1, z0); 770 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 771 z1 = _mm256_fmadd_pd(a1, w1, z1); 772 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 773 z2 = _mm256_fmadd_pd(a2, w1, z2); 774 775 /* seventh column of a */ 776 w2 = _mm256_set1_pd(work[j * 9 + 6]); 777 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 778 z0 = _mm256_fmadd_pd(a0, w2, z0); 779 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 780 z1 = _mm256_fmadd_pd(a1, w2, z1); 781 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 782 z2 = _mm256_fmadd_pd(a2, w2, z2); 783 784 /* eighth column of a */ 785 w3 = _mm256_set1_pd(work[j * 9 + 7]); 786 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 787 z0 = _mm256_fmadd_pd(a3, w3, z0); 788 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 789 z1 = _mm256_fmadd_pd(a4, w3, z1); 790 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 791 z2 = _mm256_fmadd_pd(a5, w3, z2); 792 793 /* ninth column of a */ 794 w0 = _mm256_set1_pd(work[j * 9 + 8]); 795 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 796 z0 = _mm256_fmadd_pd(a0, w0, z0); 797 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 798 z1 = _mm256_fmadd_pd(a1, w0, z1); 799 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 800 z2 = _mm256_fmadd_pd(a2, w0, z2); 801 } 802 803 _mm256_storeu_pd(&z[0], z0); 804 _mm256_storeu_pd(&z[4], z1); 805 _mm256_maskstore_pd(&z[8], mask1, z2); 806 807 v += n * bs2; 808 if (!usecprow) z += bs; 809 } 810 PetscCall(VecRestoreArrayRead(xx, &x)); 811 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 812 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 813 PetscFunctionReturn(PETSC_SUCCESS); 814 } 815 #endif 816 817 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 818 { 819 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 820 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 821 const PetscScalar *x, *xb; 822 PetscScalar *zarray, xv; 823 const MatScalar *v; 824 const PetscInt *ii, *ij = a->j, *idx; 825 PetscInt mbs, i, j, k, n, *ridx = NULL; 826 PetscBool usecprow = a->compressedrow.use; 827 828 PetscFunctionBegin; 829 PetscCall(VecGetArrayRead(xx, &x)); 830 PetscCall(VecGetArrayWrite(zz, &zarray)); 831 832 v = a->a; 833 if (usecprow) { 834 mbs = a->compressedrow.nrows; 835 ii = a->compressedrow.i; 836 ridx = a->compressedrow.rindex; 837 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 838 } else { 839 mbs = a->mbs; 840 ii = a->i; 841 z = zarray; 842 } 843 844 for (i = 0; i < mbs; i++) { 845 n = ii[i + 1] - ii[i]; 846 idx = ij + ii[i]; 847 sum1 = 0.0; 848 sum2 = 0.0; 849 sum3 = 0.0; 850 sum4 = 0.0; 851 sum5 = 0.0; 852 sum6 = 0.0; 853 sum7 = 0.0; 854 sum8 = 0.0; 855 sum9 = 0.0; 856 sum10 = 0.0; 857 sum11 = 0.0; 858 859 for (j = 0; j < n; j++) { 860 xb = x + 11 * (idx[j]); 861 862 for (k = 0; k < 11; k++) { 863 xv = xb[k]; 864 sum1 += v[0] * xv; 865 sum2 += v[1] * xv; 866 sum3 += v[2] * xv; 867 sum4 += v[3] * xv; 868 sum5 += v[4] * xv; 869 sum6 += v[5] * xv; 870 sum7 += v[6] * xv; 871 sum8 += v[7] * xv; 872 sum9 += v[8] * xv; 873 sum10 += v[9] * xv; 874 sum11 += v[10] * xv; 875 v += 11; 876 } 877 } 878 if (usecprow) z = zarray + 11 * ridx[i]; 879 z[0] = sum1; 880 z[1] = sum2; 881 z[2] = sum3; 882 z[3] = sum4; 883 z[4] = sum5; 884 z[5] = sum6; 885 z[6] = sum7; 886 z[7] = sum8; 887 z[8] = sum9; 888 z[9] = sum10; 889 z[10] = sum11; 890 891 if (!usecprow) z += 11; 892 } 893 894 PetscCall(VecRestoreArrayRead(xx, &x)); 895 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 896 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 897 PetscFunctionReturn(PETSC_SUCCESS); 898 } 899 900 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 901 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 902 { 903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 904 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 905 const PetscScalar *x, *xb; 906 PetscScalar *zarray, xv; 907 const MatScalar *v; 908 const PetscInt *ii, *ij = a->j, *idx; 909 PetscInt mbs, i, j, k, n, *ridx = NULL; 910 PetscBool usecprow = a->compressedrow.use; 911 912 PetscFunctionBegin; 913 PetscCall(VecGetArrayRead(xx, &x)); 914 PetscCall(VecGetArrayWrite(zz, &zarray)); 915 916 v = a->a; 917 if (usecprow) { 918 mbs = a->compressedrow.nrows; 919 ii = a->compressedrow.i; 920 ridx = a->compressedrow.rindex; 921 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 922 } else { 923 mbs = a->mbs; 924 ii = a->i; 925 z = zarray; 926 } 927 928 for (i = 0; i < mbs; i++) { 929 n = ii[i + 1] - ii[i]; 930 idx = ij + ii[i]; 931 sum1 = 0.0; 932 sum2 = 0.0; 933 sum3 = 0.0; 934 sum4 = 0.0; 935 sum5 = 0.0; 936 sum6 = 0.0; 937 sum7 = 0.0; 938 sum8 = 0.0; 939 sum9 = 0.0; 940 sum10 = 0.0; 941 sum11 = 0.0; 942 sum12 = 0.0; 943 944 for (j = 0; j < n; j++) { 945 xb = x + 12 * (idx[j]); 946 947 for (k = 0; k < 12; k++) { 948 xv = xb[k]; 949 sum1 += v[0] * xv; 950 sum2 += v[1] * xv; 951 sum3 += v[2] * xv; 952 sum4 += v[3] * xv; 953 sum5 += v[4] * xv; 954 sum6 += v[5] * xv; 955 sum7 += v[6] * xv; 956 sum8 += v[7] * xv; 957 sum9 += v[8] * xv; 958 sum10 += v[9] * xv; 959 sum11 += v[10] * xv; 960 sum12 += v[11] * xv; 961 v += 12; 962 } 963 } 964 if (usecprow) z = zarray + 12 * ridx[i]; 965 z[0] = sum1; 966 z[1] = sum2; 967 z[2] = sum3; 968 z[3] = sum4; 969 z[4] = sum5; 970 z[5] = sum6; 971 z[6] = sum7; 972 z[7] = sum8; 973 z[8] = sum9; 974 z[9] = sum10; 975 z[10] = sum11; 976 z[11] = sum12; 977 if (!usecprow) z += 12; 978 } 979 PetscCall(VecRestoreArrayRead(xx, &x)); 980 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 981 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 982 PetscFunctionReturn(PETSC_SUCCESS); 983 } 984 985 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 986 { 987 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 988 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 989 const PetscScalar *x, *xb; 990 PetscScalar *zarray, *yarray, xv; 991 const MatScalar *v; 992 const PetscInt *ii, *ij = a->j, *idx; 993 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 994 PetscBool usecprow = a->compressedrow.use; 995 996 PetscFunctionBegin; 997 PetscCall(VecGetArrayRead(xx, &x)); 998 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 999 1000 v = a->a; 1001 if (usecprow) { 1002 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1003 mbs = a->compressedrow.nrows; 1004 ii = a->compressedrow.i; 1005 ridx = a->compressedrow.rindex; 1006 } else { 1007 ii = a->i; 1008 y = yarray; 1009 z = zarray; 1010 } 1011 1012 for (i = 0; i < mbs; i++) { 1013 n = ii[i + 1] - ii[i]; 1014 idx = ij + ii[i]; 1015 1016 if (usecprow) { 1017 y = yarray + 12 * ridx[i]; 1018 z = zarray + 12 * ridx[i]; 1019 } 1020 sum1 = y[0]; 1021 sum2 = y[1]; 1022 sum3 = y[2]; 1023 sum4 = y[3]; 1024 sum5 = y[4]; 1025 sum6 = y[5]; 1026 sum7 = y[6]; 1027 sum8 = y[7]; 1028 sum9 = y[8]; 1029 sum10 = y[9]; 1030 sum11 = y[10]; 1031 sum12 = y[11]; 1032 1033 for (j = 0; j < n; j++) { 1034 xb = x + 12 * (idx[j]); 1035 1036 for (k = 0; k < 12; k++) { 1037 xv = xb[k]; 1038 sum1 += v[0] * xv; 1039 sum2 += v[1] * xv; 1040 sum3 += v[2] * xv; 1041 sum4 += v[3] * xv; 1042 sum5 += v[4] * xv; 1043 sum6 += v[5] * xv; 1044 sum7 += v[6] * xv; 1045 sum8 += v[7] * xv; 1046 sum9 += v[8] * xv; 1047 sum10 += v[9] * xv; 1048 sum11 += v[10] * xv; 1049 sum12 += v[11] * xv; 1050 v += 12; 1051 } 1052 } 1053 1054 z[0] = sum1; 1055 z[1] = sum2; 1056 z[2] = sum3; 1057 z[3] = sum4; 1058 z[4] = sum5; 1059 z[5] = sum6; 1060 z[6] = sum7; 1061 z[7] = sum8; 1062 z[8] = sum9; 1063 z[9] = sum10; 1064 z[10] = sum11; 1065 z[11] = sum12; 1066 if (!usecprow) { 1067 y += 12; 1068 z += 12; 1069 } 1070 } 1071 PetscCall(VecRestoreArrayRead(xx, &x)); 1072 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1073 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1074 PetscFunctionReturn(PETSC_SUCCESS); 1075 } 1076 1077 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1078 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1079 { 1080 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1081 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1082 const PetscScalar *x, *xb; 1083 PetscScalar x1, x2, x3, x4, *zarray; 1084 const MatScalar *v; 1085 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1086 PetscInt mbs, i, j, n; 1087 PetscBool usecprow = a->compressedrow.use; 1088 1089 PetscFunctionBegin; 1090 PetscCall(VecGetArrayRead(xx, &x)); 1091 PetscCall(VecGetArrayWrite(zz, &zarray)); 1092 1093 v = a->a; 1094 if (usecprow) { 1095 mbs = a->compressedrow.nrows; 1096 ii = a->compressedrow.i; 1097 ridx = a->compressedrow.rindex; 1098 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1099 } else { 1100 mbs = a->mbs; 1101 ii = a->i; 1102 z = zarray; 1103 } 1104 1105 for (i = 0; i < mbs; i++) { 1106 n = ii[i + 1] - ii[i]; 1107 idx = ij + ii[i]; 1108 1109 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1110 for (j = 0; j < n; j++) { 1111 xb = x + 12 * (idx[j]); 1112 x1 = xb[0]; 1113 x2 = xb[1]; 1114 x3 = xb[2]; 1115 x4 = xb[3]; 1116 1117 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1118 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1119 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1120 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1121 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1122 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1123 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1124 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1125 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1126 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1127 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1128 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1129 v += 48; 1130 1131 x1 = xb[4]; 1132 x2 = xb[5]; 1133 x3 = xb[6]; 1134 x4 = xb[7]; 1135 1136 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1137 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1138 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1139 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1140 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1141 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1142 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1143 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1144 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1145 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1146 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1147 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1148 v += 48; 1149 1150 x1 = xb[8]; 1151 x2 = xb[9]; 1152 x3 = xb[10]; 1153 x4 = xb[11]; 1154 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1155 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1156 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1157 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1158 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1159 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1160 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1161 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1162 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1163 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1164 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1165 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1166 v += 48; 1167 } 1168 if (usecprow) z = zarray + 12 * ridx[i]; 1169 z[0] = sum1; 1170 z[1] = sum2; 1171 z[2] = sum3; 1172 z[3] = sum4; 1173 z[4] = sum5; 1174 z[5] = sum6; 1175 z[6] = sum7; 1176 z[7] = sum8; 1177 z[8] = sum9; 1178 z[9] = sum10; 1179 z[10] = sum11; 1180 z[11] = sum12; 1181 if (!usecprow) z += 12; 1182 } 1183 PetscCall(VecRestoreArrayRead(xx, &x)); 1184 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1185 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1186 PetscFunctionReturn(PETSC_SUCCESS); 1187 } 1188 1189 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1190 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1191 { 1192 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1193 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1194 const PetscScalar *x, *xb; 1195 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1196 const MatScalar *v; 1197 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1198 PetscInt mbs = a->mbs, i, j, n; 1199 PetscBool usecprow = a->compressedrow.use; 1200 1201 PetscFunctionBegin; 1202 PetscCall(VecGetArrayRead(xx, &x)); 1203 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1204 1205 v = a->a; 1206 if (usecprow) { 1207 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1208 mbs = a->compressedrow.nrows; 1209 ii = a->compressedrow.i; 1210 ridx = a->compressedrow.rindex; 1211 } else { 1212 ii = a->i; 1213 y = yarray; 1214 z = zarray; 1215 } 1216 1217 for (i = 0; i < mbs; i++) { 1218 n = ii[i + 1] - ii[i]; 1219 idx = ij + ii[i]; 1220 1221 if (usecprow) { 1222 y = yarray + 12 * ridx[i]; 1223 z = zarray + 12 * ridx[i]; 1224 } 1225 sum1 = y[0]; 1226 sum2 = y[1]; 1227 sum3 = y[2]; 1228 sum4 = y[3]; 1229 sum5 = y[4]; 1230 sum6 = y[5]; 1231 sum7 = y[6]; 1232 sum8 = y[7]; 1233 sum9 = y[8]; 1234 sum10 = y[9]; 1235 sum11 = y[10]; 1236 sum12 = y[11]; 1237 1238 for (j = 0; j < n; j++) { 1239 xb = x + 12 * (idx[j]); 1240 x1 = xb[0]; 1241 x2 = xb[1]; 1242 x3 = xb[2]; 1243 x4 = xb[3]; 1244 1245 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1246 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1247 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1248 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1249 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1250 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1251 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1252 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1253 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1254 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1255 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1256 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1257 v += 48; 1258 1259 x1 = xb[4]; 1260 x2 = xb[5]; 1261 x3 = xb[6]; 1262 x4 = xb[7]; 1263 1264 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1265 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1266 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1267 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1268 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1269 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1270 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1271 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1272 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1273 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1274 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1275 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1276 v += 48; 1277 1278 x1 = xb[8]; 1279 x2 = xb[9]; 1280 x3 = xb[10]; 1281 x4 = xb[11]; 1282 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1283 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1284 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1285 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1286 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1287 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1288 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1289 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1290 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1291 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1292 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1293 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1294 v += 48; 1295 } 1296 z[0] = sum1; 1297 z[1] = sum2; 1298 z[2] = sum3; 1299 z[3] = sum4; 1300 z[4] = sum5; 1301 z[5] = sum6; 1302 z[6] = sum7; 1303 z[7] = sum8; 1304 z[8] = sum9; 1305 z[9] = sum10; 1306 z[10] = sum11; 1307 z[11] = sum12; 1308 if (!usecprow) { 1309 y += 12; 1310 z += 12; 1311 } 1312 } 1313 PetscCall(VecRestoreArrayRead(xx, &x)); 1314 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1315 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1316 PetscFunctionReturn(PETSC_SUCCESS); 1317 } 1318 1319 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1320 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1321 { 1322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1323 PetscScalar *z = NULL, *zarray; 1324 const PetscScalar *x, *work; 1325 const MatScalar *v = a->a; 1326 PetscInt mbs, i, j, n; 1327 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1328 PetscBool usecprow = a->compressedrow.use; 1329 const PetscInt bs = 12, bs2 = 144; 1330 1331 __m256d a0, a1, a2, a3, a4, a5; 1332 __m256d w0, w1, w2, w3; 1333 __m256d z0, z1, z2; 1334 1335 PetscFunctionBegin; 1336 PetscCall(VecGetArrayRead(xx, &x)); 1337 PetscCall(VecGetArrayWrite(zz, &zarray)); 1338 1339 if (usecprow) { 1340 mbs = a->compressedrow.nrows; 1341 ii = a->compressedrow.i; 1342 ridx = a->compressedrow.rindex; 1343 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1344 } else { 1345 mbs = a->mbs; 1346 ii = a->i; 1347 z = zarray; 1348 } 1349 1350 for (i = 0; i < mbs; i++) { 1351 z0 = _mm256_setzero_pd(); 1352 z1 = _mm256_setzero_pd(); 1353 z2 = _mm256_setzero_pd(); 1354 1355 n = ii[1] - ii[0]; 1356 ii++; 1357 for (j = 0; j < n; j++) { 1358 work = x + bs * (*idx++); 1359 1360 /* first column of a */ 1361 w0 = _mm256_set1_pd(work[0]); 1362 a0 = _mm256_loadu_pd(v + 0); 1363 z0 = _mm256_fmadd_pd(a0, w0, z0); 1364 a1 = _mm256_loadu_pd(v + 4); 1365 z1 = _mm256_fmadd_pd(a1, w0, z1); 1366 a2 = _mm256_loadu_pd(v + 8); 1367 z2 = _mm256_fmadd_pd(a2, w0, z2); 1368 1369 /* second column of a */ 1370 w1 = _mm256_set1_pd(work[1]); 1371 a3 = _mm256_loadu_pd(v + 12); 1372 z0 = _mm256_fmadd_pd(a3, w1, z0); 1373 a4 = _mm256_loadu_pd(v + 16); 1374 z1 = _mm256_fmadd_pd(a4, w1, z1); 1375 a5 = _mm256_loadu_pd(v + 20); 1376 z2 = _mm256_fmadd_pd(a5, w1, z2); 1377 1378 /* third column of a */ 1379 w2 = _mm256_set1_pd(work[2]); 1380 a0 = _mm256_loadu_pd(v + 24); 1381 z0 = _mm256_fmadd_pd(a0, w2, z0); 1382 a1 = _mm256_loadu_pd(v + 28); 1383 z1 = _mm256_fmadd_pd(a1, w2, z1); 1384 a2 = _mm256_loadu_pd(v + 32); 1385 z2 = _mm256_fmadd_pd(a2, w2, z2); 1386 1387 /* fourth column of a */ 1388 w3 = _mm256_set1_pd(work[3]); 1389 a3 = _mm256_loadu_pd(v + 36); 1390 z0 = _mm256_fmadd_pd(a3, w3, z0); 1391 a4 = _mm256_loadu_pd(v + 40); 1392 z1 = _mm256_fmadd_pd(a4, w3, z1); 1393 a5 = _mm256_loadu_pd(v + 44); 1394 z2 = _mm256_fmadd_pd(a5, w3, z2); 1395 1396 /* fifth column of a */ 1397 w0 = _mm256_set1_pd(work[4]); 1398 a0 = _mm256_loadu_pd(v + 48); 1399 z0 = _mm256_fmadd_pd(a0, w0, z0); 1400 a1 = _mm256_loadu_pd(v + 52); 1401 z1 = _mm256_fmadd_pd(a1, w0, z1); 1402 a2 = _mm256_loadu_pd(v + 56); 1403 z2 = _mm256_fmadd_pd(a2, w0, z2); 1404 1405 /* sixth column of a */ 1406 w1 = _mm256_set1_pd(work[5]); 1407 a3 = _mm256_loadu_pd(v + 60); 1408 z0 = _mm256_fmadd_pd(a3, w1, z0); 1409 a4 = _mm256_loadu_pd(v + 64); 1410 z1 = _mm256_fmadd_pd(a4, w1, z1); 1411 a5 = _mm256_loadu_pd(v + 68); 1412 z2 = _mm256_fmadd_pd(a5, w1, z2); 1413 1414 /* seventh column of a */ 1415 w2 = _mm256_set1_pd(work[6]); 1416 a0 = _mm256_loadu_pd(v + 72); 1417 z0 = _mm256_fmadd_pd(a0, w2, z0); 1418 a1 = _mm256_loadu_pd(v + 76); 1419 z1 = _mm256_fmadd_pd(a1, w2, z1); 1420 a2 = _mm256_loadu_pd(v + 80); 1421 z2 = _mm256_fmadd_pd(a2, w2, z2); 1422 1423 /* eighth column of a */ 1424 w3 = _mm256_set1_pd(work[7]); 1425 a3 = _mm256_loadu_pd(v + 84); 1426 z0 = _mm256_fmadd_pd(a3, w3, z0); 1427 a4 = _mm256_loadu_pd(v + 88); 1428 z1 = _mm256_fmadd_pd(a4, w3, z1); 1429 a5 = _mm256_loadu_pd(v + 92); 1430 z2 = _mm256_fmadd_pd(a5, w3, z2); 1431 1432 /* ninth column of a */ 1433 w0 = _mm256_set1_pd(work[8]); 1434 a0 = _mm256_loadu_pd(v + 96); 1435 z0 = _mm256_fmadd_pd(a0, w0, z0); 1436 a1 = _mm256_loadu_pd(v + 100); 1437 z1 = _mm256_fmadd_pd(a1, w0, z1); 1438 a2 = _mm256_loadu_pd(v + 104); 1439 z2 = _mm256_fmadd_pd(a2, w0, z2); 1440 1441 /* tenth column of a */ 1442 w1 = _mm256_set1_pd(work[9]); 1443 a3 = _mm256_loadu_pd(v + 108); 1444 z0 = _mm256_fmadd_pd(a3, w1, z0); 1445 a4 = _mm256_loadu_pd(v + 112); 1446 z1 = _mm256_fmadd_pd(a4, w1, z1); 1447 a5 = _mm256_loadu_pd(v + 116); 1448 z2 = _mm256_fmadd_pd(a5, w1, z2); 1449 1450 /* eleventh column of a */ 1451 w2 = _mm256_set1_pd(work[10]); 1452 a0 = _mm256_loadu_pd(v + 120); 1453 z0 = _mm256_fmadd_pd(a0, w2, z0); 1454 a1 = _mm256_loadu_pd(v + 124); 1455 z1 = _mm256_fmadd_pd(a1, w2, z1); 1456 a2 = _mm256_loadu_pd(v + 128); 1457 z2 = _mm256_fmadd_pd(a2, w2, z2); 1458 1459 /* twelveth column of a */ 1460 w3 = _mm256_set1_pd(work[11]); 1461 a3 = _mm256_loadu_pd(v + 132); 1462 z0 = _mm256_fmadd_pd(a3, w3, z0); 1463 a4 = _mm256_loadu_pd(v + 136); 1464 z1 = _mm256_fmadd_pd(a4, w3, z1); 1465 a5 = _mm256_loadu_pd(v + 140); 1466 z2 = _mm256_fmadd_pd(a5, w3, z2); 1467 1468 v += bs2; 1469 } 1470 if (usecprow) z = zarray + bs * ridx[i]; 1471 _mm256_storeu_pd(&z[0], z0); 1472 _mm256_storeu_pd(&z[4], z1); 1473 _mm256_storeu_pd(&z[8], z2); 1474 if (!usecprow) z += bs; 1475 } 1476 PetscCall(VecRestoreArrayRead(xx, &x)); 1477 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1478 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1479 PetscFunctionReturn(PETSC_SUCCESS); 1480 } 1481 #endif 1482 1483 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1484 /* Default MatMult for block size 15 */ 1485 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1486 { 1487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1488 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1489 const PetscScalar *x, *xb; 1490 PetscScalar *zarray, xv; 1491 const MatScalar *v; 1492 const PetscInt *ii, *ij = a->j, *idx; 1493 PetscInt mbs, i, j, k, n, *ridx = NULL; 1494 PetscBool usecprow = a->compressedrow.use; 1495 1496 PetscFunctionBegin; 1497 PetscCall(VecGetArrayRead(xx, &x)); 1498 PetscCall(VecGetArrayWrite(zz, &zarray)); 1499 1500 v = a->a; 1501 if (usecprow) { 1502 mbs = a->compressedrow.nrows; 1503 ii = a->compressedrow.i; 1504 ridx = a->compressedrow.rindex; 1505 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1506 } else { 1507 mbs = a->mbs; 1508 ii = a->i; 1509 z = zarray; 1510 } 1511 1512 for (i = 0; i < mbs; i++) { 1513 n = ii[i + 1] - ii[i]; 1514 idx = ij + ii[i]; 1515 sum1 = 0.0; 1516 sum2 = 0.0; 1517 sum3 = 0.0; 1518 sum4 = 0.0; 1519 sum5 = 0.0; 1520 sum6 = 0.0; 1521 sum7 = 0.0; 1522 sum8 = 0.0; 1523 sum9 = 0.0; 1524 sum10 = 0.0; 1525 sum11 = 0.0; 1526 sum12 = 0.0; 1527 sum13 = 0.0; 1528 sum14 = 0.0; 1529 sum15 = 0.0; 1530 1531 for (j = 0; j < n; j++) { 1532 xb = x + 15 * (idx[j]); 1533 1534 for (k = 0; k < 15; k++) { 1535 xv = xb[k]; 1536 sum1 += v[0] * xv; 1537 sum2 += v[1] * xv; 1538 sum3 += v[2] * xv; 1539 sum4 += v[3] * xv; 1540 sum5 += v[4] * xv; 1541 sum6 += v[5] * xv; 1542 sum7 += v[6] * xv; 1543 sum8 += v[7] * xv; 1544 sum9 += v[8] * xv; 1545 sum10 += v[9] * xv; 1546 sum11 += v[10] * xv; 1547 sum12 += v[11] * xv; 1548 sum13 += v[12] * xv; 1549 sum14 += v[13] * xv; 1550 sum15 += v[14] * xv; 1551 v += 15; 1552 } 1553 } 1554 if (usecprow) z = zarray + 15 * ridx[i]; 1555 z[0] = sum1; 1556 z[1] = sum2; 1557 z[2] = sum3; 1558 z[3] = sum4; 1559 z[4] = sum5; 1560 z[5] = sum6; 1561 z[6] = sum7; 1562 z[7] = sum8; 1563 z[8] = sum9; 1564 z[9] = sum10; 1565 z[10] = sum11; 1566 z[11] = sum12; 1567 z[12] = sum13; 1568 z[13] = sum14; 1569 z[14] = sum15; 1570 1571 if (!usecprow) z += 15; 1572 } 1573 1574 PetscCall(VecRestoreArrayRead(xx, &x)); 1575 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1576 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1577 PetscFunctionReturn(PETSC_SUCCESS); 1578 } 1579 1580 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1581 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1582 { 1583 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1584 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1585 const PetscScalar *x, *xb; 1586 PetscScalar x1, x2, x3, x4, *zarray; 1587 const MatScalar *v; 1588 const PetscInt *ii, *ij = a->j, *idx; 1589 PetscInt mbs, i, j, n, *ridx = NULL; 1590 PetscBool usecprow = a->compressedrow.use; 1591 1592 PetscFunctionBegin; 1593 PetscCall(VecGetArrayRead(xx, &x)); 1594 PetscCall(VecGetArrayWrite(zz, &zarray)); 1595 1596 v = a->a; 1597 if (usecprow) { 1598 mbs = a->compressedrow.nrows; 1599 ii = a->compressedrow.i; 1600 ridx = a->compressedrow.rindex; 1601 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1602 } else { 1603 mbs = a->mbs; 1604 ii = a->i; 1605 z = zarray; 1606 } 1607 1608 for (i = 0; i < mbs; i++) { 1609 n = ii[i + 1] - ii[i]; 1610 idx = ij + ii[i]; 1611 sum1 = 0.0; 1612 sum2 = 0.0; 1613 sum3 = 0.0; 1614 sum4 = 0.0; 1615 sum5 = 0.0; 1616 sum6 = 0.0; 1617 sum7 = 0.0; 1618 sum8 = 0.0; 1619 sum9 = 0.0; 1620 sum10 = 0.0; 1621 sum11 = 0.0; 1622 sum12 = 0.0; 1623 sum13 = 0.0; 1624 sum14 = 0.0; 1625 sum15 = 0.0; 1626 1627 for (j = 0; j < n; j++) { 1628 xb = x + 15 * (idx[j]); 1629 x1 = xb[0]; 1630 x2 = xb[1]; 1631 x3 = xb[2]; 1632 x4 = xb[3]; 1633 1634 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1635 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1636 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1637 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1638 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1639 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1640 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1641 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1642 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1643 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1644 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1645 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1646 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1647 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1648 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1649 1650 v += 60; 1651 1652 x1 = xb[4]; 1653 x2 = xb[5]; 1654 x3 = xb[6]; 1655 x4 = xb[7]; 1656 1657 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1658 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1659 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1660 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1661 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1662 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1663 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1664 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1665 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1666 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1667 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1668 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1669 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1670 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1671 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1672 v += 60; 1673 1674 x1 = xb[8]; 1675 x2 = xb[9]; 1676 x3 = xb[10]; 1677 x4 = xb[11]; 1678 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1679 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1680 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1681 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1682 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1683 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1684 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1685 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1686 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1687 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1688 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1689 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1690 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1691 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1692 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1693 v += 60; 1694 1695 x1 = xb[12]; 1696 x2 = xb[13]; 1697 x3 = xb[14]; 1698 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1699 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1700 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1701 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1702 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1703 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1704 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1705 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1706 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1707 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1708 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1709 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1710 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1711 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1712 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1713 v += 45; 1714 } 1715 if (usecprow) z = zarray + 15 * ridx[i]; 1716 z[0] = sum1; 1717 z[1] = sum2; 1718 z[2] = sum3; 1719 z[3] = sum4; 1720 z[4] = sum5; 1721 z[5] = sum6; 1722 z[6] = sum7; 1723 z[7] = sum8; 1724 z[8] = sum9; 1725 z[9] = sum10; 1726 z[10] = sum11; 1727 z[11] = sum12; 1728 z[12] = sum13; 1729 z[13] = sum14; 1730 z[14] = sum15; 1731 1732 if (!usecprow) z += 15; 1733 } 1734 1735 PetscCall(VecRestoreArrayRead(xx, &x)); 1736 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1737 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1738 PetscFunctionReturn(PETSC_SUCCESS); 1739 } 1740 1741 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1742 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1743 { 1744 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1745 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1746 const PetscScalar *x, *xb; 1747 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1748 const MatScalar *v; 1749 const PetscInt *ii, *ij = a->j, *idx; 1750 PetscInt mbs, i, j, n, *ridx = NULL; 1751 PetscBool usecprow = a->compressedrow.use; 1752 1753 PetscFunctionBegin; 1754 PetscCall(VecGetArrayRead(xx, &x)); 1755 PetscCall(VecGetArrayWrite(zz, &zarray)); 1756 1757 v = a->a; 1758 if (usecprow) { 1759 mbs = a->compressedrow.nrows; 1760 ii = a->compressedrow.i; 1761 ridx = a->compressedrow.rindex; 1762 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1763 } else { 1764 mbs = a->mbs; 1765 ii = a->i; 1766 z = zarray; 1767 } 1768 1769 for (i = 0; i < mbs; i++) { 1770 n = ii[i + 1] - ii[i]; 1771 idx = ij + ii[i]; 1772 sum1 = 0.0; 1773 sum2 = 0.0; 1774 sum3 = 0.0; 1775 sum4 = 0.0; 1776 sum5 = 0.0; 1777 sum6 = 0.0; 1778 sum7 = 0.0; 1779 sum8 = 0.0; 1780 sum9 = 0.0; 1781 sum10 = 0.0; 1782 sum11 = 0.0; 1783 sum12 = 0.0; 1784 sum13 = 0.0; 1785 sum14 = 0.0; 1786 sum15 = 0.0; 1787 1788 for (j = 0; j < n; j++) { 1789 xb = x + 15 * (idx[j]); 1790 x1 = xb[0]; 1791 x2 = xb[1]; 1792 x3 = xb[2]; 1793 x4 = xb[3]; 1794 x5 = xb[4]; 1795 x6 = xb[5]; 1796 x7 = xb[6]; 1797 x8 = xb[7]; 1798 1799 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1800 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1801 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1802 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1803 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1804 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1805 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1806 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1807 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1808 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1809 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1810 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1811 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1812 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1813 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1814 v += 120; 1815 1816 x1 = xb[8]; 1817 x2 = xb[9]; 1818 x3 = xb[10]; 1819 x4 = xb[11]; 1820 x5 = xb[12]; 1821 x6 = xb[13]; 1822 x7 = xb[14]; 1823 1824 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1825 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1826 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1827 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1828 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1829 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1830 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1831 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1832 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1833 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1834 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1835 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1836 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1837 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1838 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1839 v += 105; 1840 } 1841 if (usecprow) z = zarray + 15 * ridx[i]; 1842 z[0] = sum1; 1843 z[1] = sum2; 1844 z[2] = sum3; 1845 z[3] = sum4; 1846 z[4] = sum5; 1847 z[5] = sum6; 1848 z[6] = sum7; 1849 z[7] = sum8; 1850 z[8] = sum9; 1851 z[9] = sum10; 1852 z[10] = sum11; 1853 z[11] = sum12; 1854 z[12] = sum13; 1855 z[13] = sum14; 1856 z[14] = sum15; 1857 1858 if (!usecprow) z += 15; 1859 } 1860 1861 PetscCall(VecRestoreArrayRead(xx, &x)); 1862 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1863 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1864 PetscFunctionReturn(PETSC_SUCCESS); 1865 } 1866 1867 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1868 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) 1869 { 1870 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1871 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1872 const PetscScalar *x, *xb; 1873 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1874 const MatScalar *v; 1875 const PetscInt *ii, *ij = a->j, *idx; 1876 PetscInt mbs, i, j, n, *ridx = NULL; 1877 PetscBool usecprow = a->compressedrow.use; 1878 1879 PetscFunctionBegin; 1880 PetscCall(VecGetArrayRead(xx, &x)); 1881 PetscCall(VecGetArrayWrite(zz, &zarray)); 1882 1883 v = a->a; 1884 if (usecprow) { 1885 mbs = a->compressedrow.nrows; 1886 ii = a->compressedrow.i; 1887 ridx = a->compressedrow.rindex; 1888 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1889 } else { 1890 mbs = a->mbs; 1891 ii = a->i; 1892 z = zarray; 1893 } 1894 1895 for (i = 0; i < mbs; i++) { 1896 n = ii[i + 1] - ii[i]; 1897 idx = ij + ii[i]; 1898 sum1 = 0.0; 1899 sum2 = 0.0; 1900 sum3 = 0.0; 1901 sum4 = 0.0; 1902 sum5 = 0.0; 1903 sum6 = 0.0; 1904 sum7 = 0.0; 1905 sum8 = 0.0; 1906 sum9 = 0.0; 1907 sum10 = 0.0; 1908 sum11 = 0.0; 1909 sum12 = 0.0; 1910 sum13 = 0.0; 1911 sum14 = 0.0; 1912 sum15 = 0.0; 1913 1914 for (j = 0; j < n; j++) { 1915 xb = x + 15 * (idx[j]); 1916 x1 = xb[0]; 1917 x2 = xb[1]; 1918 x3 = xb[2]; 1919 x4 = xb[3]; 1920 x5 = xb[4]; 1921 x6 = xb[5]; 1922 x7 = xb[6]; 1923 x8 = xb[7]; 1924 x9 = xb[8]; 1925 x10 = xb[9]; 1926 x11 = xb[10]; 1927 x12 = xb[11]; 1928 x13 = xb[12]; 1929 x14 = xb[13]; 1930 x15 = xb[14]; 1931 1932 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1933 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1934 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1935 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1936 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1937 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1938 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1939 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1940 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1941 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1942 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1943 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1944 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1945 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1946 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1947 v += 225; 1948 } 1949 if (usecprow) z = zarray + 15 * ridx[i]; 1950 z[0] = sum1; 1951 z[1] = sum2; 1952 z[2] = sum3; 1953 z[3] = sum4; 1954 z[4] = sum5; 1955 z[5] = sum6; 1956 z[6] = sum7; 1957 z[7] = sum8; 1958 z[8] = sum9; 1959 z[9] = sum10; 1960 z[10] = sum11; 1961 z[11] = sum12; 1962 z[12] = sum13; 1963 z[13] = sum14; 1964 z[14] = sum15; 1965 1966 if (!usecprow) z += 15; 1967 } 1968 1969 PetscCall(VecRestoreArrayRead(xx, &x)); 1970 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1971 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1972 PetscFunctionReturn(PETSC_SUCCESS); 1973 } 1974 1975 /* 1976 This will not work with MatScalar == float because it calls the BLAS 1977 */ 1978 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 1979 { 1980 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1981 PetscScalar *z = NULL, *work, *workt, *zarray; 1982 const PetscScalar *x, *xb; 1983 const MatScalar *v; 1984 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1985 const PetscInt *idx, *ii, *ridx = NULL; 1986 PetscInt ncols, k; 1987 PetscBool usecprow = a->compressedrow.use; 1988 1989 PetscFunctionBegin; 1990 PetscCall(VecGetArrayRead(xx, &x)); 1991 PetscCall(VecGetArrayWrite(zz, &zarray)); 1992 1993 idx = a->j; 1994 v = a->a; 1995 if (usecprow) { 1996 mbs = a->compressedrow.nrows; 1997 ii = a->compressedrow.i; 1998 ridx = a->compressedrow.rindex; 1999 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2000 } else { 2001 mbs = a->mbs; 2002 ii = a->i; 2003 z = zarray; 2004 } 2005 2006 if (!a->mult_work) { 2007 k = PetscMax(A->rmap->n, A->cmap->n); 2008 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2009 } 2010 work = a->mult_work; 2011 for (i = 0; i < mbs; i++) { 2012 n = ii[1] - ii[0]; 2013 ii++; 2014 ncols = n * bs; 2015 workt = work; 2016 for (j = 0; j < n; j++) { 2017 xb = x + bs * (*idx++); 2018 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2019 workt += bs; 2020 } 2021 if (usecprow) z = zarray + bs * ridx[i]; 2022 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2023 v += n * bs2; 2024 if (!usecprow) z += bs; 2025 } 2026 PetscCall(VecRestoreArrayRead(xx, &x)); 2027 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2028 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2029 PetscFunctionReturn(PETSC_SUCCESS); 2030 } 2031 2032 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2033 { 2034 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2035 const PetscScalar *x; 2036 PetscScalar *y, *z, sum; 2037 const MatScalar *v; 2038 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2039 const PetscInt *idx, *ii; 2040 PetscBool usecprow = a->compressedrow.use; 2041 2042 PetscFunctionBegin; 2043 PetscCall(VecGetArrayRead(xx, &x)); 2044 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2045 2046 idx = a->j; 2047 v = a->a; 2048 if (usecprow) { 2049 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 2050 mbs = a->compressedrow.nrows; 2051 ii = a->compressedrow.i; 2052 ridx = a->compressedrow.rindex; 2053 } else { 2054 ii = a->i; 2055 } 2056 2057 for (i = 0; i < mbs; i++) { 2058 n = ii[1] - ii[0]; 2059 ii++; 2060 if (!usecprow) { 2061 sum = y[i]; 2062 } else { 2063 sum = y[ridx[i]]; 2064 } 2065 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2066 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2067 PetscSparseDensePlusDot(sum, x, v, idx, n); 2068 v += n; 2069 idx += n; 2070 if (usecprow) { 2071 z[ridx[i]] = sum; 2072 } else { 2073 z[i] = sum; 2074 } 2075 } 2076 PetscCall(VecRestoreArrayRead(xx, &x)); 2077 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2078 PetscCall(PetscLogFlops(2.0 * a->nz)); 2079 PetscFunctionReturn(PETSC_SUCCESS); 2080 } 2081 2082 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2083 { 2084 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2085 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2086 const PetscScalar *x, *xb; 2087 PetscScalar x1, x2, *yarray, *zarray; 2088 const MatScalar *v; 2089 PetscInt mbs = a->mbs, i, n, j; 2090 const PetscInt *idx, *ii, *ridx = NULL; 2091 PetscBool usecprow = a->compressedrow.use; 2092 2093 PetscFunctionBegin; 2094 PetscCall(VecGetArrayRead(xx, &x)); 2095 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2096 2097 idx = a->j; 2098 v = a->a; 2099 if (usecprow) { 2100 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 2101 mbs = a->compressedrow.nrows; 2102 ii = a->compressedrow.i; 2103 ridx = a->compressedrow.rindex; 2104 } else { 2105 ii = a->i; 2106 y = yarray; 2107 z = zarray; 2108 } 2109 2110 for (i = 0; i < mbs; i++) { 2111 n = ii[1] - ii[0]; 2112 ii++; 2113 if (usecprow) { 2114 z = zarray + 2 * ridx[i]; 2115 y = yarray + 2 * ridx[i]; 2116 } 2117 sum1 = y[0]; 2118 sum2 = y[1]; 2119 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2120 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2121 for (j = 0; j < n; j++) { 2122 xb = x + 2 * (*idx++); 2123 x1 = xb[0]; 2124 x2 = xb[1]; 2125 2126 sum1 += v[0] * x1 + v[2] * x2; 2127 sum2 += v[1] * x1 + v[3] * x2; 2128 v += 4; 2129 } 2130 z[0] = sum1; 2131 z[1] = sum2; 2132 if (!usecprow) { 2133 z += 2; 2134 y += 2; 2135 } 2136 } 2137 PetscCall(VecRestoreArrayRead(xx, &x)); 2138 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2139 PetscCall(PetscLogFlops(4.0 * a->nz)); 2140 PetscFunctionReturn(PETSC_SUCCESS); 2141 } 2142 2143 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2144 { 2145 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2146 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2147 const PetscScalar *x, *xb; 2148 const MatScalar *v; 2149 PetscInt mbs = a->mbs, i, j, n; 2150 const PetscInt *idx, *ii, *ridx = NULL; 2151 PetscBool usecprow = a->compressedrow.use; 2152 2153 PetscFunctionBegin; 2154 PetscCall(VecGetArrayRead(xx, &x)); 2155 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2156 2157 idx = a->j; 2158 v = a->a; 2159 if (usecprow) { 2160 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 2161 mbs = a->compressedrow.nrows; 2162 ii = a->compressedrow.i; 2163 ridx = a->compressedrow.rindex; 2164 } else { 2165 ii = a->i; 2166 y = yarray; 2167 z = zarray; 2168 } 2169 2170 for (i = 0; i < mbs; i++) { 2171 n = ii[1] - ii[0]; 2172 ii++; 2173 if (usecprow) { 2174 z = zarray + 3 * ridx[i]; 2175 y = yarray + 3 * ridx[i]; 2176 } 2177 sum1 = y[0]; 2178 sum2 = y[1]; 2179 sum3 = y[2]; 2180 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2181 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2182 for (j = 0; j < n; j++) { 2183 xb = x + 3 * (*idx++); 2184 x1 = xb[0]; 2185 x2 = xb[1]; 2186 x3 = xb[2]; 2187 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2188 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2189 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2190 v += 9; 2191 } 2192 z[0] = sum1; 2193 z[1] = sum2; 2194 z[2] = sum3; 2195 if (!usecprow) { 2196 z += 3; 2197 y += 3; 2198 } 2199 } 2200 PetscCall(VecRestoreArrayRead(xx, &x)); 2201 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2202 PetscCall(PetscLogFlops(18.0 * a->nz)); 2203 PetscFunctionReturn(PETSC_SUCCESS); 2204 } 2205 2206 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2207 { 2208 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2209 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2210 const PetscScalar *x, *xb; 2211 const MatScalar *v; 2212 PetscInt mbs = a->mbs, i, j, n; 2213 const PetscInt *idx, *ii, *ridx = NULL; 2214 PetscBool usecprow = a->compressedrow.use; 2215 2216 PetscFunctionBegin; 2217 PetscCall(VecGetArrayRead(xx, &x)); 2218 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2219 2220 idx = a->j; 2221 v = a->a; 2222 if (usecprow) { 2223 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 2224 mbs = a->compressedrow.nrows; 2225 ii = a->compressedrow.i; 2226 ridx = a->compressedrow.rindex; 2227 } else { 2228 ii = a->i; 2229 y = yarray; 2230 z = zarray; 2231 } 2232 2233 for (i = 0; i < mbs; i++) { 2234 n = ii[1] - ii[0]; 2235 ii++; 2236 if (usecprow) { 2237 z = zarray + 4 * ridx[i]; 2238 y = yarray + 4 * ridx[i]; 2239 } 2240 sum1 = y[0]; 2241 sum2 = y[1]; 2242 sum3 = y[2]; 2243 sum4 = y[3]; 2244 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2245 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2246 for (j = 0; j < n; j++) { 2247 xb = x + 4 * (*idx++); 2248 x1 = xb[0]; 2249 x2 = xb[1]; 2250 x3 = xb[2]; 2251 x4 = xb[3]; 2252 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2253 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2254 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2255 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2256 v += 16; 2257 } 2258 z[0] = sum1; 2259 z[1] = sum2; 2260 z[2] = sum3; 2261 z[3] = sum4; 2262 if (!usecprow) { 2263 z += 4; 2264 y += 4; 2265 } 2266 } 2267 PetscCall(VecRestoreArrayRead(xx, &x)); 2268 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2269 PetscCall(PetscLogFlops(32.0 * a->nz)); 2270 PetscFunctionReturn(PETSC_SUCCESS); 2271 } 2272 2273 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2274 { 2275 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2276 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2277 const PetscScalar *x, *xb; 2278 PetscScalar *yarray, *zarray; 2279 const MatScalar *v; 2280 PetscInt mbs = a->mbs, i, j, n; 2281 const PetscInt *idx, *ii, *ridx = NULL; 2282 PetscBool usecprow = a->compressedrow.use; 2283 2284 PetscFunctionBegin; 2285 PetscCall(VecGetArrayRead(xx, &x)); 2286 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2287 2288 idx = a->j; 2289 v = a->a; 2290 if (usecprow) { 2291 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 2292 mbs = a->compressedrow.nrows; 2293 ii = a->compressedrow.i; 2294 ridx = a->compressedrow.rindex; 2295 } else { 2296 ii = a->i; 2297 y = yarray; 2298 z = zarray; 2299 } 2300 2301 for (i = 0; i < mbs; i++) { 2302 n = ii[1] - ii[0]; 2303 ii++; 2304 if (usecprow) { 2305 z = zarray + 5 * ridx[i]; 2306 y = yarray + 5 * ridx[i]; 2307 } 2308 sum1 = y[0]; 2309 sum2 = y[1]; 2310 sum3 = y[2]; 2311 sum4 = y[3]; 2312 sum5 = y[4]; 2313 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2314 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2315 for (j = 0; j < n; j++) { 2316 xb = x + 5 * (*idx++); 2317 x1 = xb[0]; 2318 x2 = xb[1]; 2319 x3 = xb[2]; 2320 x4 = xb[3]; 2321 x5 = xb[4]; 2322 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2323 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2324 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2325 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2326 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2327 v += 25; 2328 } 2329 z[0] = sum1; 2330 z[1] = sum2; 2331 z[2] = sum3; 2332 z[3] = sum4; 2333 z[4] = sum5; 2334 if (!usecprow) { 2335 z += 5; 2336 y += 5; 2337 } 2338 } 2339 PetscCall(VecRestoreArrayRead(xx, &x)); 2340 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2341 PetscCall(PetscLogFlops(50.0 * a->nz)); 2342 PetscFunctionReturn(PETSC_SUCCESS); 2343 } 2344 2345 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2346 { 2347 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2348 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2349 const PetscScalar *x, *xb; 2350 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2351 const MatScalar *v; 2352 PetscInt mbs = a->mbs, i, j, n; 2353 const PetscInt *idx, *ii, *ridx = NULL; 2354 PetscBool usecprow = a->compressedrow.use; 2355 2356 PetscFunctionBegin; 2357 PetscCall(VecGetArrayRead(xx, &x)); 2358 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2359 2360 idx = a->j; 2361 v = a->a; 2362 if (usecprow) { 2363 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 2364 mbs = a->compressedrow.nrows; 2365 ii = a->compressedrow.i; 2366 ridx = a->compressedrow.rindex; 2367 } else { 2368 ii = a->i; 2369 y = yarray; 2370 z = zarray; 2371 } 2372 2373 for (i = 0; i < mbs; i++) { 2374 n = ii[1] - ii[0]; 2375 ii++; 2376 if (usecprow) { 2377 z = zarray + 6 * ridx[i]; 2378 y = yarray + 6 * ridx[i]; 2379 } 2380 sum1 = y[0]; 2381 sum2 = y[1]; 2382 sum3 = y[2]; 2383 sum4 = y[3]; 2384 sum5 = y[4]; 2385 sum6 = y[5]; 2386 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2387 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2388 for (j = 0; j < n; j++) { 2389 xb = x + 6 * (*idx++); 2390 x1 = xb[0]; 2391 x2 = xb[1]; 2392 x3 = xb[2]; 2393 x4 = xb[3]; 2394 x5 = xb[4]; 2395 x6 = xb[5]; 2396 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2397 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2398 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2399 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2400 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2401 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2402 v += 36; 2403 } 2404 z[0] = sum1; 2405 z[1] = sum2; 2406 z[2] = sum3; 2407 z[3] = sum4; 2408 z[4] = sum5; 2409 z[5] = sum6; 2410 if (!usecprow) { 2411 z += 6; 2412 y += 6; 2413 } 2414 } 2415 PetscCall(VecRestoreArrayRead(xx, &x)); 2416 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2417 PetscCall(PetscLogFlops(72.0 * a->nz)); 2418 PetscFunctionReturn(PETSC_SUCCESS); 2419 } 2420 2421 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2422 { 2423 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2424 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2425 const PetscScalar *x, *xb; 2426 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2427 const MatScalar *v; 2428 PetscInt mbs = a->mbs, i, j, n; 2429 const PetscInt *idx, *ii, *ridx = NULL; 2430 PetscBool usecprow = a->compressedrow.use; 2431 2432 PetscFunctionBegin; 2433 PetscCall(VecGetArrayRead(xx, &x)); 2434 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2435 2436 idx = a->j; 2437 v = a->a; 2438 if (usecprow) { 2439 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2440 mbs = a->compressedrow.nrows; 2441 ii = a->compressedrow.i; 2442 ridx = a->compressedrow.rindex; 2443 } else { 2444 ii = a->i; 2445 y = yarray; 2446 z = zarray; 2447 } 2448 2449 for (i = 0; i < mbs; i++) { 2450 n = ii[1] - ii[0]; 2451 ii++; 2452 if (usecprow) { 2453 z = zarray + 7 * ridx[i]; 2454 y = yarray + 7 * ridx[i]; 2455 } 2456 sum1 = y[0]; 2457 sum2 = y[1]; 2458 sum3 = y[2]; 2459 sum4 = y[3]; 2460 sum5 = y[4]; 2461 sum6 = y[5]; 2462 sum7 = y[6]; 2463 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2464 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2465 for (j = 0; j < n; j++) { 2466 xb = x + 7 * (*idx++); 2467 x1 = xb[0]; 2468 x2 = xb[1]; 2469 x3 = xb[2]; 2470 x4 = xb[3]; 2471 x5 = xb[4]; 2472 x6 = xb[5]; 2473 x7 = xb[6]; 2474 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2475 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2476 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2477 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2478 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2479 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2480 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2481 v += 49; 2482 } 2483 z[0] = sum1; 2484 z[1] = sum2; 2485 z[2] = sum3; 2486 z[3] = sum4; 2487 z[4] = sum5; 2488 z[5] = sum6; 2489 z[6] = sum7; 2490 if (!usecprow) { 2491 z += 7; 2492 y += 7; 2493 } 2494 } 2495 PetscCall(VecRestoreArrayRead(xx, &x)); 2496 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2497 PetscCall(PetscLogFlops(98.0 * a->nz)); 2498 PetscFunctionReturn(PETSC_SUCCESS); 2499 } 2500 2501 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2502 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2503 { 2504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2505 PetscScalar *z = NULL, *work, *workt, *zarray; 2506 const PetscScalar *x, *xb; 2507 const MatScalar *v; 2508 PetscInt mbs, i, j, n; 2509 PetscInt k; 2510 PetscBool usecprow = a->compressedrow.use; 2511 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2512 2513 __m256d a0, a1, a2, a3, a4, a5; 2514 __m256d w0, w1, w2, w3; 2515 __m256d z0, z1, z2; 2516 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2517 2518 PetscFunctionBegin; 2519 PetscCall(VecCopy(yy, zz)); 2520 PetscCall(VecGetArrayRead(xx, &x)); 2521 PetscCall(VecGetArray(zz, &zarray)); 2522 2523 idx = a->j; 2524 v = a->a; 2525 if (usecprow) { 2526 mbs = a->compressedrow.nrows; 2527 ii = a->compressedrow.i; 2528 ridx = a->compressedrow.rindex; 2529 } else { 2530 mbs = a->mbs; 2531 ii = a->i; 2532 z = zarray; 2533 } 2534 2535 if (!a->mult_work) { 2536 k = PetscMax(A->rmap->n, A->cmap->n); 2537 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2538 } 2539 2540 work = a->mult_work; 2541 for (i = 0; i < mbs; i++) { 2542 n = ii[1] - ii[0]; 2543 ii++; 2544 workt = work; 2545 for (j = 0; j < n; j++) { 2546 xb = x + bs * (*idx++); 2547 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2548 workt += bs; 2549 } 2550 if (usecprow) z = zarray + bs * ridx[i]; 2551 2552 z0 = _mm256_loadu_pd(&z[0]); 2553 z1 = _mm256_loadu_pd(&z[4]); 2554 z2 = _mm256_set1_pd(z[8]); 2555 2556 for (j = 0; j < n; j++) { 2557 /* first column of a */ 2558 w0 = _mm256_set1_pd(work[j * 9]); 2559 a0 = _mm256_loadu_pd(&v[j * 81]); 2560 z0 = _mm256_fmadd_pd(a0, w0, z0); 2561 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2562 z1 = _mm256_fmadd_pd(a1, w0, z1); 2563 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2564 z2 = _mm256_fmadd_pd(a2, w0, z2); 2565 2566 /* second column of a */ 2567 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2568 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2569 z0 = _mm256_fmadd_pd(a0, w1, z0); 2570 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2571 z1 = _mm256_fmadd_pd(a1, w1, z1); 2572 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2573 z2 = _mm256_fmadd_pd(a2, w1, z2); 2574 2575 /* third column of a */ 2576 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2577 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2578 z0 = _mm256_fmadd_pd(a3, w2, z0); 2579 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2580 z1 = _mm256_fmadd_pd(a4, w2, z1); 2581 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2582 z2 = _mm256_fmadd_pd(a5, w2, z2); 2583 2584 /* fourth column of a */ 2585 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2586 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2587 z0 = _mm256_fmadd_pd(a0, w3, z0); 2588 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2589 z1 = _mm256_fmadd_pd(a1, w3, z1); 2590 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2591 z2 = _mm256_fmadd_pd(a2, w3, z2); 2592 2593 /* fifth column of a */ 2594 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2595 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2596 z0 = _mm256_fmadd_pd(a3, w0, z0); 2597 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2598 z1 = _mm256_fmadd_pd(a4, w0, z1); 2599 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2600 z2 = _mm256_fmadd_pd(a5, w0, z2); 2601 2602 /* sixth column of a */ 2603 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2604 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2605 z0 = _mm256_fmadd_pd(a0, w1, z0); 2606 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2607 z1 = _mm256_fmadd_pd(a1, w1, z1); 2608 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2609 z2 = _mm256_fmadd_pd(a2, w1, z2); 2610 2611 /* seventh column of a */ 2612 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2613 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2614 z0 = _mm256_fmadd_pd(a0, w2, z0); 2615 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2616 z1 = _mm256_fmadd_pd(a1, w2, z1); 2617 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2618 z2 = _mm256_fmadd_pd(a2, w2, z2); 2619 2620 /* eighth column of a */ 2621 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2622 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2623 z0 = _mm256_fmadd_pd(a3, w3, z0); 2624 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2625 z1 = _mm256_fmadd_pd(a4, w3, z1); 2626 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2627 z2 = _mm256_fmadd_pd(a5, w3, z2); 2628 2629 /* ninth column of a */ 2630 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2631 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2632 z0 = _mm256_fmadd_pd(a0, w0, z0); 2633 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2634 z1 = _mm256_fmadd_pd(a1, w0, z1); 2635 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2636 z2 = _mm256_fmadd_pd(a2, w0, z2); 2637 } 2638 2639 _mm256_storeu_pd(&z[0], z0); 2640 _mm256_storeu_pd(&z[4], z1); 2641 _mm256_maskstore_pd(&z[8], mask1, z2); 2642 2643 v += n * bs2; 2644 if (!usecprow) z += bs; 2645 } 2646 PetscCall(VecRestoreArrayRead(xx, &x)); 2647 PetscCall(VecRestoreArray(zz, &zarray)); 2648 PetscCall(PetscLogFlops(162.0 * a->nz)); 2649 PetscFunctionReturn(PETSC_SUCCESS); 2650 } 2651 #endif 2652 2653 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2654 { 2655 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2656 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2657 const PetscScalar *x, *xb; 2658 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2659 const MatScalar *v; 2660 PetscInt mbs = a->mbs, i, j, n; 2661 const PetscInt *idx, *ii, *ridx = NULL; 2662 PetscBool usecprow = a->compressedrow.use; 2663 2664 PetscFunctionBegin; 2665 PetscCall(VecGetArrayRead(xx, &x)); 2666 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2667 2668 idx = a->j; 2669 v = a->a; 2670 if (usecprow) { 2671 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2672 mbs = a->compressedrow.nrows; 2673 ii = a->compressedrow.i; 2674 ridx = a->compressedrow.rindex; 2675 } else { 2676 ii = a->i; 2677 y = yarray; 2678 z = zarray; 2679 } 2680 2681 for (i = 0; i < mbs; i++) { 2682 n = ii[1] - ii[0]; 2683 ii++; 2684 if (usecprow) { 2685 z = zarray + 11 * ridx[i]; 2686 y = yarray + 11 * ridx[i]; 2687 } 2688 sum1 = y[0]; 2689 sum2 = y[1]; 2690 sum3 = y[2]; 2691 sum4 = y[3]; 2692 sum5 = y[4]; 2693 sum6 = y[5]; 2694 sum7 = y[6]; 2695 sum8 = y[7]; 2696 sum9 = y[8]; 2697 sum10 = y[9]; 2698 sum11 = y[10]; 2699 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2700 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2701 for (j = 0; j < n; j++) { 2702 xb = x + 11 * (*idx++); 2703 x1 = xb[0]; 2704 x2 = xb[1]; 2705 x3 = xb[2]; 2706 x4 = xb[3]; 2707 x5 = xb[4]; 2708 x6 = xb[5]; 2709 x7 = xb[6]; 2710 x8 = xb[7]; 2711 x9 = xb[8]; 2712 x10 = xb[9]; 2713 x11 = xb[10]; 2714 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2715 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2716 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2717 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2718 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2719 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2720 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2721 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2722 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2723 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2724 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2725 v += 121; 2726 } 2727 z[0] = sum1; 2728 z[1] = sum2; 2729 z[2] = sum3; 2730 z[3] = sum4; 2731 z[4] = sum5; 2732 z[5] = sum6; 2733 z[6] = sum7; 2734 z[7] = sum8; 2735 z[8] = sum9; 2736 z[9] = sum10; 2737 z[10] = sum11; 2738 if (!usecprow) { 2739 z += 11; 2740 y += 11; 2741 } 2742 } 2743 PetscCall(VecRestoreArrayRead(xx, &x)); 2744 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2745 PetscCall(PetscLogFlops(242.0 * a->nz)); 2746 PetscFunctionReturn(PETSC_SUCCESS); 2747 } 2748 2749 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2750 { 2751 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2752 PetscScalar *z = NULL, *work, *workt, *zarray; 2753 const PetscScalar *x, *xb; 2754 const MatScalar *v; 2755 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2756 PetscInt ncols, k; 2757 const PetscInt *ridx = NULL, *idx, *ii; 2758 PetscBool usecprow = a->compressedrow.use; 2759 2760 PetscFunctionBegin; 2761 PetscCall(VecCopy(yy, zz)); 2762 PetscCall(VecGetArrayRead(xx, &x)); 2763 PetscCall(VecGetArray(zz, &zarray)); 2764 2765 idx = a->j; 2766 v = a->a; 2767 if (usecprow) { 2768 mbs = a->compressedrow.nrows; 2769 ii = a->compressedrow.i; 2770 ridx = a->compressedrow.rindex; 2771 } else { 2772 mbs = a->mbs; 2773 ii = a->i; 2774 z = zarray; 2775 } 2776 2777 if (!a->mult_work) { 2778 k = PetscMax(A->rmap->n, A->cmap->n); 2779 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2780 } 2781 work = a->mult_work; 2782 for (i = 0; i < mbs; i++) { 2783 n = ii[1] - ii[0]; 2784 ii++; 2785 ncols = n * bs; 2786 workt = work; 2787 for (j = 0; j < n; j++) { 2788 xb = x + bs * (*idx++); 2789 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2790 workt += bs; 2791 } 2792 if (usecprow) z = zarray + bs * ridx[i]; 2793 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2794 v += n * bs2; 2795 if (!usecprow) z += bs; 2796 } 2797 PetscCall(VecRestoreArrayRead(xx, &x)); 2798 PetscCall(VecRestoreArray(zz, &zarray)); 2799 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2800 PetscFunctionReturn(PETSC_SUCCESS); 2801 } 2802 2803 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2804 { 2805 PetscScalar zero = 0.0; 2806 2807 PetscFunctionBegin; 2808 PetscCall(VecSet(zz, zero)); 2809 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2810 PetscFunctionReturn(PETSC_SUCCESS); 2811 } 2812 2813 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2814 { 2815 PetscScalar zero = 0.0; 2816 2817 PetscFunctionBegin; 2818 PetscCall(VecSet(zz, zero)); 2819 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2820 PetscFunctionReturn(PETSC_SUCCESS); 2821 } 2822 2823 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2824 { 2825 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2826 PetscScalar *z, x1, x2, x3, x4, x5; 2827 const PetscScalar *x, *xb = NULL; 2828 const MatScalar *v; 2829 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2830 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2831 Mat_CompressedRow cprow = a->compressedrow; 2832 PetscBool usecprow = cprow.use; 2833 2834 PetscFunctionBegin; 2835 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2836 PetscCall(VecGetArrayRead(xx, &x)); 2837 PetscCall(VecGetArray(zz, &z)); 2838 2839 idx = a->j; 2840 v = a->a; 2841 if (usecprow) { 2842 mbs = cprow.nrows; 2843 ii = cprow.i; 2844 ridx = cprow.rindex; 2845 } else { 2846 mbs = a->mbs; 2847 ii = a->i; 2848 xb = x; 2849 } 2850 2851 switch (bs) { 2852 case 1: 2853 for (i = 0; i < mbs; i++) { 2854 if (usecprow) xb = x + ridx[i]; 2855 x1 = xb[0]; 2856 ib = idx + ii[0]; 2857 n = ii[1] - ii[0]; 2858 ii++; 2859 for (j = 0; j < n; j++) { 2860 rval = ib[j]; 2861 z[rval] += PetscConj(*v) * x1; 2862 v++; 2863 } 2864 if (!usecprow) xb++; 2865 } 2866 break; 2867 case 2: 2868 for (i = 0; i < mbs; i++) { 2869 if (usecprow) xb = x + 2 * ridx[i]; 2870 x1 = xb[0]; 2871 x2 = xb[1]; 2872 ib = idx + ii[0]; 2873 n = ii[1] - ii[0]; 2874 ii++; 2875 for (j = 0; j < n; j++) { 2876 rval = ib[j] * 2; 2877 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2878 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2879 v += 4; 2880 } 2881 if (!usecprow) xb += 2; 2882 } 2883 break; 2884 case 3: 2885 for (i = 0; i < mbs; i++) { 2886 if (usecprow) xb = x + 3 * ridx[i]; 2887 x1 = xb[0]; 2888 x2 = xb[1]; 2889 x3 = xb[2]; 2890 ib = idx + ii[0]; 2891 n = ii[1] - ii[0]; 2892 ii++; 2893 for (j = 0; j < n; j++) { 2894 rval = ib[j] * 3; 2895 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2896 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2897 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2898 v += 9; 2899 } 2900 if (!usecprow) xb += 3; 2901 } 2902 break; 2903 case 4: 2904 for (i = 0; i < mbs; i++) { 2905 if (usecprow) xb = x + 4 * ridx[i]; 2906 x1 = xb[0]; 2907 x2 = xb[1]; 2908 x3 = xb[2]; 2909 x4 = xb[3]; 2910 ib = idx + ii[0]; 2911 n = ii[1] - ii[0]; 2912 ii++; 2913 for (j = 0; j < n; j++) { 2914 rval = ib[j] * 4; 2915 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2916 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2917 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2918 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2919 v += 16; 2920 } 2921 if (!usecprow) xb += 4; 2922 } 2923 break; 2924 case 5: 2925 for (i = 0; i < mbs; i++) { 2926 if (usecprow) xb = x + 5 * ridx[i]; 2927 x1 = xb[0]; 2928 x2 = xb[1]; 2929 x3 = xb[2]; 2930 x4 = xb[3]; 2931 x5 = xb[4]; 2932 ib = idx + ii[0]; 2933 n = ii[1] - ii[0]; 2934 ii++; 2935 for (j = 0; j < n; j++) { 2936 rval = ib[j] * 5; 2937 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2938 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2939 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2940 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2941 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2942 v += 25; 2943 } 2944 if (!usecprow) xb += 5; 2945 } 2946 break; 2947 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2948 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2949 #if 0 2950 { 2951 PetscInt ncols,k,bs2=a->bs2; 2952 PetscScalar *work,*workt,zb; 2953 const PetscScalar *xtmp; 2954 if (!a->mult_work) { 2955 k = PetscMax(A->rmap->n,A->cmap->n); 2956 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2957 } 2958 work = a->mult_work; 2959 xtmp = x; 2960 for (i=0; i<mbs; i++) { 2961 n = ii[1] - ii[0]; ii++; 2962 ncols = n*bs; 2963 PetscCall(PetscArrayzero(work,ncols)); 2964 if (usecprow) xtmp = x + bs*ridx[i]; 2965 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2966 v += n*bs2; 2967 if (!usecprow) xtmp += bs; 2968 workt = work; 2969 for (j=0; j<n; j++) { 2970 zb = z + bs*(*idx++); 2971 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2972 workt += bs; 2973 } 2974 } 2975 } 2976 #endif 2977 } 2978 PetscCall(VecRestoreArrayRead(xx, &x)); 2979 PetscCall(VecRestoreArray(zz, &z)); 2980 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 2981 PetscFunctionReturn(PETSC_SUCCESS); 2982 } 2983 2984 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2985 { 2986 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2987 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2988 const PetscScalar *x, *xb = NULL; 2989 const MatScalar *v; 2990 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2991 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2992 Mat_CompressedRow cprow = a->compressedrow; 2993 PetscBool usecprow = cprow.use; 2994 2995 PetscFunctionBegin; 2996 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2997 PetscCall(VecGetArrayRead(xx, &x)); 2998 PetscCall(VecGetArray(zz, &z)); 2999 3000 idx = a->j; 3001 v = a->a; 3002 if (usecprow) { 3003 mbs = cprow.nrows; 3004 ii = cprow.i; 3005 ridx = cprow.rindex; 3006 } else { 3007 mbs = a->mbs; 3008 ii = a->i; 3009 xb = x; 3010 } 3011 3012 switch (bs) { 3013 case 1: 3014 for (i = 0; i < mbs; i++) { 3015 if (usecprow) xb = x + ridx[i]; 3016 x1 = xb[0]; 3017 ib = idx + ii[0]; 3018 n = ii[1] - ii[0]; 3019 ii++; 3020 for (j = 0; j < n; j++) { 3021 rval = ib[j]; 3022 z[rval] += *v * x1; 3023 v++; 3024 } 3025 if (!usecprow) xb++; 3026 } 3027 break; 3028 case 2: 3029 for (i = 0; i < mbs; i++) { 3030 if (usecprow) xb = x + 2 * ridx[i]; 3031 x1 = xb[0]; 3032 x2 = xb[1]; 3033 ib = idx + ii[0]; 3034 n = ii[1] - ii[0]; 3035 ii++; 3036 for (j = 0; j < n; j++) { 3037 rval = ib[j] * 2; 3038 z[rval++] += v[0] * x1 + v[1] * x2; 3039 z[rval++] += v[2] * x1 + v[3] * x2; 3040 v += 4; 3041 } 3042 if (!usecprow) xb += 2; 3043 } 3044 break; 3045 case 3: 3046 for (i = 0; i < mbs; i++) { 3047 if (usecprow) xb = x + 3 * ridx[i]; 3048 x1 = xb[0]; 3049 x2 = xb[1]; 3050 x3 = xb[2]; 3051 ib = idx + ii[0]; 3052 n = ii[1] - ii[0]; 3053 ii++; 3054 for (j = 0; j < n; j++) { 3055 rval = ib[j] * 3; 3056 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3057 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3058 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3059 v += 9; 3060 } 3061 if (!usecprow) xb += 3; 3062 } 3063 break; 3064 case 4: 3065 for (i = 0; i < mbs; i++) { 3066 if (usecprow) xb = x + 4 * ridx[i]; 3067 x1 = xb[0]; 3068 x2 = xb[1]; 3069 x3 = xb[2]; 3070 x4 = xb[3]; 3071 ib = idx + ii[0]; 3072 n = ii[1] - ii[0]; 3073 ii++; 3074 for (j = 0; j < n; j++) { 3075 rval = ib[j] * 4; 3076 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3077 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3078 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3079 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3080 v += 16; 3081 } 3082 if (!usecprow) xb += 4; 3083 } 3084 break; 3085 case 5: 3086 for (i = 0; i < mbs; i++) { 3087 if (usecprow) xb = x + 5 * ridx[i]; 3088 x1 = xb[0]; 3089 x2 = xb[1]; 3090 x3 = xb[2]; 3091 x4 = xb[3]; 3092 x5 = xb[4]; 3093 ib = idx + ii[0]; 3094 n = ii[1] - ii[0]; 3095 ii++; 3096 for (j = 0; j < n; j++) { 3097 rval = ib[j] * 5; 3098 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3099 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3100 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3101 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3102 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3103 v += 25; 3104 } 3105 if (!usecprow) xb += 5; 3106 } 3107 break; 3108 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3109 PetscInt ncols, k; 3110 PetscScalar *work, *workt; 3111 const PetscScalar *xtmp; 3112 if (!a->mult_work) { 3113 k = PetscMax(A->rmap->n, A->cmap->n); 3114 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3115 } 3116 work = a->mult_work; 3117 xtmp = x; 3118 for (i = 0; i < mbs; i++) { 3119 n = ii[1] - ii[0]; 3120 ii++; 3121 ncols = n * bs; 3122 PetscCall(PetscArrayzero(work, ncols)); 3123 if (usecprow) xtmp = x + bs * ridx[i]; 3124 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3125 v += n * bs2; 3126 if (!usecprow) xtmp += bs; 3127 workt = work; 3128 for (j = 0; j < n; j++) { 3129 zb = z + bs * (*idx++); 3130 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3131 workt += bs; 3132 } 3133 } 3134 } 3135 } 3136 PetscCall(VecRestoreArrayRead(xx, &x)); 3137 PetscCall(VecRestoreArray(zz, &z)); 3138 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3139 PetscFunctionReturn(PETSC_SUCCESS); 3140 } 3141 3142 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3143 { 3144 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3145 PetscInt totalnz = a->bs2 * a->nz; 3146 PetscScalar oalpha = alpha; 3147 PetscBLASInt one = 1, tnz; 3148 3149 PetscFunctionBegin; 3150 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3151 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3152 PetscCall(PetscLogFlops(totalnz)); 3153 PetscFunctionReturn(PETSC_SUCCESS); 3154 } 3155 3156 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3157 { 3158 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3159 MatScalar *v = a->a; 3160 PetscReal sum = 0.0; 3161 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3162 3163 PetscFunctionBegin; 3164 if (type == NORM_FROBENIUS) { 3165 #if defined(PETSC_USE_REAL___FP16) 3166 PetscBLASInt one = 1, cnt = bs2 * nz; 3167 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3168 #else 3169 for (i = 0; i < bs2 * nz; i++) { 3170 sum += PetscRealPart(PetscConj(*v) * (*v)); 3171 v++; 3172 } 3173 #endif 3174 *norm = PetscSqrtReal(sum); 3175 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3176 } else if (type == NORM_1) { /* maximum column sum */ 3177 PetscReal *tmp; 3178 PetscInt *bcol = a->j; 3179 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3180 for (i = 0; i < nz; i++) { 3181 for (j = 0; j < bs; j++) { 3182 k1 = bs * (*bcol) + j; /* column index */ 3183 for (k = 0; k < bs; k++) { 3184 tmp[k1] += PetscAbsScalar(*v); 3185 v++; 3186 } 3187 } 3188 bcol++; 3189 } 3190 *norm = 0.0; 3191 for (j = 0; j < A->cmap->n; j++) { 3192 if (tmp[j] > *norm) *norm = tmp[j]; 3193 } 3194 PetscCall(PetscFree(tmp)); 3195 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3196 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3197 *norm = 0.0; 3198 for (k = 0; k < bs; k++) { 3199 for (j = 0; j < a->mbs; j++) { 3200 v = a->a + bs2 * a->i[j] + k; 3201 sum = 0.0; 3202 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3203 for (k1 = 0; k1 < bs; k1++) { 3204 sum += PetscAbsScalar(*v); 3205 v += bs; 3206 } 3207 } 3208 if (sum > *norm) *norm = sum; 3209 } 3210 } 3211 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3212 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3213 PetscFunctionReturn(PETSC_SUCCESS); 3214 } 3215 3216 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3217 { 3218 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3219 3220 PetscFunctionBegin; 3221 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3222 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3223 *flg = PETSC_FALSE; 3224 PetscFunctionReturn(PETSC_SUCCESS); 3225 } 3226 3227 /* if the a->i are the same */ 3228 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3229 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3230 3231 /* if a->j are the same */ 3232 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3233 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3234 3235 /* if a->a are the same */ 3236 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3237 PetscFunctionReturn(PETSC_SUCCESS); 3238 } 3239 3240 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3241 { 3242 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3243 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3244 PetscScalar *x, zero = 0.0; 3245 MatScalar *aa, *aa_j; 3246 3247 PetscFunctionBegin; 3248 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3249 bs = A->rmap->bs; 3250 aa = a->a; 3251 ai = a->i; 3252 aj = a->j; 3253 ambs = a->mbs; 3254 bs2 = a->bs2; 3255 3256 PetscCall(VecSet(v, zero)); 3257 PetscCall(VecGetArray(v, &x)); 3258 PetscCall(VecGetLocalSize(v, &n)); 3259 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3260 for (i = 0; i < ambs; i++) { 3261 for (j = ai[i]; j < ai[i + 1]; j++) { 3262 if (aj[j] == i) { 3263 row = i * bs; 3264 aa_j = aa + j * bs2; 3265 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3266 break; 3267 } 3268 } 3269 } 3270 PetscCall(VecRestoreArray(v, &x)); 3271 PetscFunctionReturn(PETSC_SUCCESS); 3272 } 3273 3274 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3275 { 3276 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3277 const PetscScalar *l, *r, *li, *ri; 3278 PetscScalar x; 3279 MatScalar *aa, *v; 3280 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3281 const PetscInt *ai, *aj; 3282 3283 PetscFunctionBegin; 3284 ai = a->i; 3285 aj = a->j; 3286 aa = a->a; 3287 m = A->rmap->n; 3288 n = A->cmap->n; 3289 bs = A->rmap->bs; 3290 mbs = a->mbs; 3291 bs2 = a->bs2; 3292 if (ll) { 3293 PetscCall(VecGetArrayRead(ll, &l)); 3294 PetscCall(VecGetLocalSize(ll, &lm)); 3295 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3296 for (i = 0; i < mbs; i++) { /* for each block row */ 3297 M = ai[i + 1] - ai[i]; 3298 li = l + i * bs; 3299 v = aa + bs2 * ai[i]; 3300 for (j = 0; j < M; j++) { /* for each block */ 3301 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 3302 } 3303 } 3304 PetscCall(VecRestoreArrayRead(ll, &l)); 3305 PetscCall(PetscLogFlops(a->nz)); 3306 } 3307 3308 if (rr) { 3309 PetscCall(VecGetArrayRead(rr, &r)); 3310 PetscCall(VecGetLocalSize(rr, &rn)); 3311 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3312 for (i = 0; i < mbs; i++) { /* for each block row */ 3313 iai = ai[i]; 3314 M = ai[i + 1] - iai; 3315 v = aa + bs2 * iai; 3316 for (j = 0; j < M; j++) { /* for each block */ 3317 ri = r + bs * aj[iai + j]; 3318 for (k = 0; k < bs; k++) { 3319 x = ri[k]; 3320 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3321 v += bs; 3322 } 3323 } 3324 } 3325 PetscCall(VecRestoreArrayRead(rr, &r)); 3326 PetscCall(PetscLogFlops(a->nz)); 3327 } 3328 PetscFunctionReturn(PETSC_SUCCESS); 3329 } 3330 3331 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3332 { 3333 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3334 3335 PetscFunctionBegin; 3336 info->block_size = a->bs2; 3337 info->nz_allocated = a->bs2 * a->maxnz; 3338 info->nz_used = a->bs2 * a->nz; 3339 info->nz_unneeded = info->nz_allocated - info->nz_used; 3340 info->assemblies = A->num_ass; 3341 info->mallocs = A->info.mallocs; 3342 info->memory = 0; /* REVIEW ME */ 3343 if (A->factortype) { 3344 info->fill_ratio_given = A->info.fill_ratio_given; 3345 info->fill_ratio_needed = A->info.fill_ratio_needed; 3346 info->factor_mallocs = A->info.factor_mallocs; 3347 } else { 3348 info->fill_ratio_given = 0; 3349 info->fill_ratio_needed = 0; 3350 info->factor_mallocs = 0; 3351 } 3352 PetscFunctionReturn(PETSC_SUCCESS); 3353 } 3354 3355 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3356 { 3357 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3358 3359 PetscFunctionBegin; 3360 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3361 PetscFunctionReturn(PETSC_SUCCESS); 3362 } 3363 3364 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3365 { 3366 PetscFunctionBegin; 3367 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3368 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3369 PetscFunctionReturn(PETSC_SUCCESS); 3370 } 3371 3372 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3373 { 3374 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3375 PetscScalar *z = NULL, sum1; 3376 const PetscScalar *xb; 3377 PetscScalar x1; 3378 const MatScalar *v, *vv; 3379 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3380 PetscBool usecprow = a->compressedrow.use; 3381 3382 PetscFunctionBegin; 3383 idx = a->j; 3384 v = a->a; 3385 if (usecprow) { 3386 mbs = a->compressedrow.nrows; 3387 ii = a->compressedrow.i; 3388 ridx = a->compressedrow.rindex; 3389 } else { 3390 mbs = a->mbs; 3391 ii = a->i; 3392 z = c; 3393 } 3394 3395 for (i = 0; i < mbs; i++) { 3396 n = ii[1] - ii[0]; 3397 ii++; 3398 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3399 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3400 if (usecprow) z = c + ridx[i]; 3401 jj = idx; 3402 vv = v; 3403 for (k = 0; k < cn; k++) { 3404 idx = jj; 3405 v = vv; 3406 sum1 = 0.0; 3407 for (j = 0; j < n; j++) { 3408 xb = b + (*idx++); 3409 x1 = xb[0 + k * bm]; 3410 sum1 += v[0] * x1; 3411 v += 1; 3412 } 3413 z[0 + k * cm] = sum1; 3414 } 3415 if (!usecprow) z += 1; 3416 } 3417 PetscFunctionReturn(PETSC_SUCCESS); 3418 } 3419 3420 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3421 { 3422 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3423 PetscScalar *z = NULL, sum1, sum2; 3424 const PetscScalar *xb; 3425 PetscScalar x1, x2; 3426 const MatScalar *v, *vv; 3427 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3428 PetscBool usecprow = a->compressedrow.use; 3429 3430 PetscFunctionBegin; 3431 idx = a->j; 3432 v = a->a; 3433 if (usecprow) { 3434 mbs = a->compressedrow.nrows; 3435 ii = a->compressedrow.i; 3436 ridx = a->compressedrow.rindex; 3437 } else { 3438 mbs = a->mbs; 3439 ii = a->i; 3440 z = c; 3441 } 3442 3443 for (i = 0; i < mbs; i++) { 3444 n = ii[1] - ii[0]; 3445 ii++; 3446 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3447 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3448 if (usecprow) z = c + 2 * ridx[i]; 3449 jj = idx; 3450 vv = v; 3451 for (k = 0; k < cn; k++) { 3452 idx = jj; 3453 v = vv; 3454 sum1 = 0.0; 3455 sum2 = 0.0; 3456 for (j = 0; j < n; j++) { 3457 xb = b + 2 * (*idx++); 3458 x1 = xb[0 + k * bm]; 3459 x2 = xb[1 + k * bm]; 3460 sum1 += v[0] * x1 + v[2] * x2; 3461 sum2 += v[1] * x1 + v[3] * x2; 3462 v += 4; 3463 } 3464 z[0 + k * cm] = sum1; 3465 z[1 + k * cm] = sum2; 3466 } 3467 if (!usecprow) z += 2; 3468 } 3469 PetscFunctionReturn(PETSC_SUCCESS); 3470 } 3471 3472 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3473 { 3474 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3475 PetscScalar *z = NULL, sum1, sum2, sum3; 3476 const PetscScalar *xb; 3477 PetscScalar x1, x2, x3; 3478 const MatScalar *v, *vv; 3479 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3480 PetscBool usecprow = a->compressedrow.use; 3481 3482 PetscFunctionBegin; 3483 idx = a->j; 3484 v = a->a; 3485 if (usecprow) { 3486 mbs = a->compressedrow.nrows; 3487 ii = a->compressedrow.i; 3488 ridx = a->compressedrow.rindex; 3489 } else { 3490 mbs = a->mbs; 3491 ii = a->i; 3492 z = c; 3493 } 3494 3495 for (i = 0; i < mbs; i++) { 3496 n = ii[1] - ii[0]; 3497 ii++; 3498 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3499 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3500 if (usecprow) z = c + 3 * ridx[i]; 3501 jj = idx; 3502 vv = v; 3503 for (k = 0; k < cn; k++) { 3504 idx = jj; 3505 v = vv; 3506 sum1 = 0.0; 3507 sum2 = 0.0; 3508 sum3 = 0.0; 3509 for (j = 0; j < n; j++) { 3510 xb = b + 3 * (*idx++); 3511 x1 = xb[0 + k * bm]; 3512 x2 = xb[1 + k * bm]; 3513 x3 = xb[2 + k * bm]; 3514 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3515 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3516 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3517 v += 9; 3518 } 3519 z[0 + k * cm] = sum1; 3520 z[1 + k * cm] = sum2; 3521 z[2 + k * cm] = sum3; 3522 } 3523 if (!usecprow) z += 3; 3524 } 3525 PetscFunctionReturn(PETSC_SUCCESS); 3526 } 3527 3528 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3529 { 3530 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3531 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3532 const PetscScalar *xb; 3533 PetscScalar x1, x2, x3, x4; 3534 const MatScalar *v, *vv; 3535 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3536 PetscBool usecprow = a->compressedrow.use; 3537 3538 PetscFunctionBegin; 3539 idx = a->j; 3540 v = a->a; 3541 if (usecprow) { 3542 mbs = a->compressedrow.nrows; 3543 ii = a->compressedrow.i; 3544 ridx = a->compressedrow.rindex; 3545 } else { 3546 mbs = a->mbs; 3547 ii = a->i; 3548 z = c; 3549 } 3550 3551 for (i = 0; i < mbs; i++) { 3552 n = ii[1] - ii[0]; 3553 ii++; 3554 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3555 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3556 if (usecprow) z = c + 4 * ridx[i]; 3557 jj = idx; 3558 vv = v; 3559 for (k = 0; k < cn; k++) { 3560 idx = jj; 3561 v = vv; 3562 sum1 = 0.0; 3563 sum2 = 0.0; 3564 sum3 = 0.0; 3565 sum4 = 0.0; 3566 for (j = 0; j < n; j++) { 3567 xb = b + 4 * (*idx++); 3568 x1 = xb[0 + k * bm]; 3569 x2 = xb[1 + k * bm]; 3570 x3 = xb[2 + k * bm]; 3571 x4 = xb[3 + k * bm]; 3572 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3573 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3574 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3575 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3576 v += 16; 3577 } 3578 z[0 + k * cm] = sum1; 3579 z[1 + k * cm] = sum2; 3580 z[2 + k * cm] = sum3; 3581 z[3 + k * cm] = sum4; 3582 } 3583 if (!usecprow) z += 4; 3584 } 3585 PetscFunctionReturn(PETSC_SUCCESS); 3586 } 3587 3588 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3589 { 3590 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3591 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3592 const PetscScalar *xb; 3593 PetscScalar x1, x2, x3, x4, x5; 3594 const MatScalar *v, *vv; 3595 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3596 PetscBool usecprow = a->compressedrow.use; 3597 3598 PetscFunctionBegin; 3599 idx = a->j; 3600 v = a->a; 3601 if (usecprow) { 3602 mbs = a->compressedrow.nrows; 3603 ii = a->compressedrow.i; 3604 ridx = a->compressedrow.rindex; 3605 } else { 3606 mbs = a->mbs; 3607 ii = a->i; 3608 z = c; 3609 } 3610 3611 for (i = 0; i < mbs; i++) { 3612 n = ii[1] - ii[0]; 3613 ii++; 3614 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3615 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3616 if (usecprow) z = c + 5 * ridx[i]; 3617 jj = idx; 3618 vv = v; 3619 for (k = 0; k < cn; k++) { 3620 idx = jj; 3621 v = vv; 3622 sum1 = 0.0; 3623 sum2 = 0.0; 3624 sum3 = 0.0; 3625 sum4 = 0.0; 3626 sum5 = 0.0; 3627 for (j = 0; j < n; j++) { 3628 xb = b + 5 * (*idx++); 3629 x1 = xb[0 + k * bm]; 3630 x2 = xb[1 + k * bm]; 3631 x3 = xb[2 + k * bm]; 3632 x4 = xb[3 + k * bm]; 3633 x5 = xb[4 + k * bm]; 3634 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3635 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3636 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3637 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3638 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3639 v += 25; 3640 } 3641 z[0 + k * cm] = sum1; 3642 z[1 + k * cm] = sum2; 3643 z[2 + k * cm] = sum3; 3644 z[3 + k * cm] = sum4; 3645 z[4 + k * cm] = sum5; 3646 } 3647 if (!usecprow) z += 5; 3648 } 3649 PetscFunctionReturn(PETSC_SUCCESS); 3650 } 3651 3652 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3653 { 3654 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3655 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3656 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3657 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3658 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3659 PetscBLASInt bbs, bcn, bbm, bcm; 3660 PetscScalar *z = NULL; 3661 PetscScalar *c, *b; 3662 const MatScalar *v; 3663 const PetscInt *idx, *ii, *ridx = NULL; 3664 PetscScalar _DZero = 0.0, _DOne = 1.0; 3665 PetscBool usecprow = a->compressedrow.use; 3666 3667 PetscFunctionBegin; 3668 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 3669 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3670 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3671 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3672 b = bd->v; 3673 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 3674 PetscCall(MatDenseGetArray(C, &c)); 3675 switch (bs) { 3676 case 1: 3677 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3678 break; 3679 case 2: 3680 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3681 break; 3682 case 3: 3683 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3684 break; 3685 case 4: 3686 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3687 break; 3688 case 5: 3689 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3690 break; 3691 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3692 PetscCall(PetscBLASIntCast(bs, &bbs)); 3693 PetscCall(PetscBLASIntCast(cn, &bcn)); 3694 PetscCall(PetscBLASIntCast(bm, &bbm)); 3695 PetscCall(PetscBLASIntCast(cm, &bcm)); 3696 idx = a->j; 3697 v = a->a; 3698 if (usecprow) { 3699 mbs = a->compressedrow.nrows; 3700 ii = a->compressedrow.i; 3701 ridx = a->compressedrow.rindex; 3702 } else { 3703 mbs = a->mbs; 3704 ii = a->i; 3705 z = c; 3706 } 3707 for (i = 0; i < mbs; i++) { 3708 n = ii[1] - ii[0]; 3709 ii++; 3710 if (usecprow) z = c + bs * ridx[i]; 3711 if (n) { 3712 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3713 v += bs2; 3714 } 3715 for (j = 1; j < n; j++) { 3716 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3717 v += bs2; 3718 } 3719 if (!usecprow) z += bs; 3720 } 3721 } 3722 PetscCall(MatDenseRestoreArray(C, &c)); 3723 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3724 PetscFunctionReturn(PETSC_SUCCESS); 3725 } 3726