1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #elif defined(PETSC_HAVE_XMMINTRIN_H) 10 #include <xmmintrin.h> 11 #endif 12 13 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 14 { 15 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 16 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 17 const PetscInt *idx; 18 PetscInt start, end, *ai, *aj, bs; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 28 29 PetscCall(PetscBTCreate(m, &table)); 30 PetscCall(PetscMalloc1(m + 1, &nidx)); 31 32 for (i = 0; i < is_max; i++) { 33 /* Initialise the two local arrays */ 34 isz = 0; 35 PetscCall(PetscBTMemzero(m, table)); 36 37 /* Extract the indices, assume there can be duplicate entries */ 38 PetscCall(ISGetIndices(is[i], &idx)); 39 PetscCall(ISGetLocalSize(is[i], &n)); 40 41 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 42 for (j = 0; j < n; ++j) { 43 ival = idx[j] / bs; /* convert the indices into block indices */ 44 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 45 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 46 } 47 PetscCall(ISRestoreIndices(is[i], &idx)); 48 PetscCall(ISDestroy(&is[i])); 49 50 k = 0; 51 for (j = 0; j < ov; j++) { /* for each overlap*/ 52 n = isz; 53 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 54 row = nidx[k]; 55 start = ai[row]; 56 end = ai[row + 1]; 57 for (l = start; l < end; l++) { 58 val = aj[l]; 59 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 60 } 61 } 62 } 63 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 64 } 65 PetscCall(PetscBTDestroy(&table)); 66 PetscCall(PetscFree(nidx)); 67 PetscFunctionReturn(PETSC_SUCCESS); 68 } 69 70 static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 71 { 72 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 73 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 74 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 75 const PetscInt *irow, *icol; 76 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 77 PetscInt *aj = a->j, *ai = a->i; 78 MatScalar *mat_a; 79 Mat C; 80 PetscBool flag; 81 82 PetscFunctionBegin; 83 PetscCall(ISGetIndices(isrow, &irow)); 84 PetscCall(ISGetIndices(iscol, &icol)); 85 PetscCall(ISGetLocalSize(isrow, &nrows)); 86 PetscCall(ISGetLocalSize(iscol, &ncols)); 87 88 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 89 ssmap = smap; 90 PetscCall(PetscMalloc1(1 + nrows, &lens)); 91 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 92 /* determine lens of each row */ 93 for (i = 0; i < nrows; i++) { 94 kstart = ai[irow[i]]; 95 kend = kstart + a->ilen[irow[i]]; 96 lens[i] = 0; 97 for (k = kstart; k < kend; k++) { 98 if (ssmap[aj[k]]) lens[i]++; 99 } 100 } 101 /* Create and fill new matrix */ 102 if (scall == MAT_REUSE_MATRIX) { 103 c = (Mat_SeqBAIJ *)((*B)->data); 104 105 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 106 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 107 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 108 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 109 C = *B; 110 } else { 111 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 112 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 113 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 114 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 115 } 116 c = (Mat_SeqBAIJ *)C->data; 117 for (i = 0; i < nrows; i++) { 118 row = irow[i]; 119 kstart = ai[row]; 120 kend = kstart + a->ilen[row]; 121 mat_i = c->i[i]; 122 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 123 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 124 mat_ilen = c->ilen + i; 125 for (k = kstart; k < kend; k++) { 126 if ((tcol = ssmap[a->j[k]])) { 127 *mat_j++ = tcol - 1; 128 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 129 mat_a += bs2; 130 (*mat_ilen)++; 131 } 132 } 133 } 134 /* sort */ 135 if (c->j && c->a) { 136 MatScalar *work; 137 PetscCall(PetscMalloc1(bs2, &work)); 138 for (i = 0; i < nrows; i++) { 139 PetscInt ilen; 140 mat_i = c->i[i]; 141 mat_j = c->j + mat_i; 142 mat_a = c->a + mat_i * bs2; 143 ilen = c->ilen[i]; 144 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 145 } 146 PetscCall(PetscFree(work)); 147 } 148 149 /* Free work space */ 150 PetscCall(ISRestoreIndices(iscol, &icol)); 151 PetscCall(PetscFree(smap)); 152 PetscCall(PetscFree(lens)); 153 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 154 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 155 156 PetscCall(ISRestoreIndices(isrow, &irow)); 157 *B = C; 158 PetscFunctionReturn(PETSC_SUCCESS); 159 } 160 161 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 162 { 163 IS is1, is2; 164 165 PetscFunctionBegin; 166 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 167 if (isrow == iscol) { 168 is2 = is1; 169 PetscCall(PetscObjectReference((PetscObject)is2)); 170 } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 171 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 172 PetscCall(ISDestroy(&is1)); 173 PetscCall(ISDestroy(&is2)); 174 PetscFunctionReturn(PETSC_SUCCESS); 175 } 176 177 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 178 { 179 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 180 Mat_SubSppt *submatj = c->submatis1; 181 182 PetscFunctionBegin; 183 PetscCall((*submatj->destroy)(C)); 184 PetscCall(MatDestroySubMatrix_Private(submatj)); 185 PetscFunctionReturn(PETSC_SUCCESS); 186 } 187 188 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 189 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 190 { 191 PetscInt i; 192 Mat C; 193 Mat_SeqBAIJ *c; 194 Mat_SubSppt *submatj; 195 196 PetscFunctionBegin; 197 for (i = 0; i < n; i++) { 198 C = (*mat)[i]; 199 c = (Mat_SeqBAIJ *)C->data; 200 submatj = c->submatis1; 201 if (submatj) { 202 if (--((PetscObject)C)->refct <= 0) { 203 PetscCall(PetscFree(C->factorprefix)); 204 PetscCall((*submatj->destroy)(C)); 205 PetscCall(MatDestroySubMatrix_Private(submatj)); 206 PetscCall(PetscFree(C->defaultvectype)); 207 PetscCall(PetscFree(C->defaultrandtype)); 208 PetscCall(PetscLayoutDestroy(&C->rmap)); 209 PetscCall(PetscLayoutDestroy(&C->cmap)); 210 PetscCall(PetscHeaderDestroy(&C)); 211 } 212 } else { 213 PetscCall(MatDestroy(&C)); 214 } 215 } 216 217 /* Destroy Dummy submatrices created for reuse */ 218 PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 219 220 PetscCall(PetscFree(*mat)); 221 PetscFunctionReturn(PETSC_SUCCESS); 222 } 223 224 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 225 { 226 PetscInt i; 227 228 PetscFunctionBegin; 229 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 230 231 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 232 PetscFunctionReturn(PETSC_SUCCESS); 233 } 234 235 /* Should check that shapes of vectors and matrices match */ 236 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 237 { 238 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 239 PetscScalar *z, sum; 240 const PetscScalar *x; 241 const MatScalar *v; 242 PetscInt mbs, i, n; 243 const PetscInt *idx, *ii, *ridx = NULL; 244 PetscBool usecprow = a->compressedrow.use; 245 246 PetscFunctionBegin; 247 PetscCall(VecGetArrayRead(xx, &x)); 248 PetscCall(VecGetArrayWrite(zz, &z)); 249 250 if (usecprow) { 251 mbs = a->compressedrow.nrows; 252 ii = a->compressedrow.i; 253 ridx = a->compressedrow.rindex; 254 PetscCall(PetscArrayzero(z, a->mbs)); 255 } else { 256 mbs = a->mbs; 257 ii = a->i; 258 } 259 260 for (i = 0; i < mbs; i++) { 261 n = ii[1] - ii[0]; 262 v = a->a + ii[0]; 263 idx = a->j + ii[0]; 264 ii++; 265 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 266 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 267 sum = 0.0; 268 PetscSparseDensePlusDot(sum, x, v, idx, n); 269 if (usecprow) { 270 z[ridx[i]] = sum; 271 } else { 272 z[i] = sum; 273 } 274 } 275 PetscCall(VecRestoreArrayRead(xx, &x)); 276 PetscCall(VecRestoreArrayWrite(zz, &z)); 277 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 278 PetscFunctionReturn(PETSC_SUCCESS); 279 } 280 281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 284 PetscScalar *z = NULL, sum1, sum2, *zarray; 285 const PetscScalar *x, *xb; 286 PetscScalar x1, x2; 287 const MatScalar *v; 288 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 289 PetscBool usecprow = a->compressedrow.use; 290 291 PetscFunctionBegin; 292 PetscCall(VecGetArrayRead(xx, &x)); 293 PetscCall(VecGetArrayWrite(zz, &zarray)); 294 295 idx = a->j; 296 v = a->a; 297 if (usecprow) { 298 mbs = a->compressedrow.nrows; 299 ii = a->compressedrow.i; 300 ridx = a->compressedrow.rindex; 301 PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 302 } else { 303 mbs = a->mbs; 304 ii = a->i; 305 z = zarray; 306 } 307 308 for (i = 0; i < mbs; i++) { 309 n = ii[1] - ii[0]; 310 ii++; 311 sum1 = 0.0; 312 sum2 = 0.0; 313 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 314 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 315 for (j = 0; j < n; j++) { 316 xb = x + 2 * (*idx++); 317 x1 = xb[0]; 318 x2 = xb[1]; 319 sum1 += v[0] * x1 + v[2] * x2; 320 sum2 += v[1] * x1 + v[3] * x2; 321 v += 4; 322 } 323 if (usecprow) z = zarray + 2 * ridx[i]; 324 z[0] = sum1; 325 z[1] = sum2; 326 if (!usecprow) z += 2; 327 } 328 PetscCall(VecRestoreArrayRead(xx, &x)); 329 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 330 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 331 PetscFunctionReturn(PETSC_SUCCESS); 332 } 333 334 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 335 { 336 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 337 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 338 const PetscScalar *x, *xb; 339 const MatScalar *v; 340 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 341 PetscBool usecprow = a->compressedrow.use; 342 343 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 344 #pragma disjoint(*v, *z, *xb) 345 #endif 346 347 PetscFunctionBegin; 348 PetscCall(VecGetArrayRead(xx, &x)); 349 PetscCall(VecGetArrayWrite(zz, &zarray)); 350 351 idx = a->j; 352 v = a->a; 353 if (usecprow) { 354 mbs = a->compressedrow.nrows; 355 ii = a->compressedrow.i; 356 ridx = a->compressedrow.rindex; 357 PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 358 } else { 359 mbs = a->mbs; 360 ii = a->i; 361 z = zarray; 362 } 363 364 for (i = 0; i < mbs; i++) { 365 n = ii[1] - ii[0]; 366 ii++; 367 sum1 = 0.0; 368 sum2 = 0.0; 369 sum3 = 0.0; 370 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 371 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 372 for (j = 0; j < n; j++) { 373 xb = x + 3 * (*idx++); 374 x1 = xb[0]; 375 x2 = xb[1]; 376 x3 = xb[2]; 377 378 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 379 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 380 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 381 v += 9; 382 } 383 if (usecprow) z = zarray + 3 * ridx[i]; 384 z[0] = sum1; 385 z[1] = sum2; 386 z[2] = sum3; 387 if (!usecprow) z += 3; 388 } 389 PetscCall(VecRestoreArrayRead(xx, &x)); 390 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 391 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 392 PetscFunctionReturn(PETSC_SUCCESS); 393 } 394 395 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 396 { 397 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 398 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 399 const PetscScalar *x, *xb; 400 const MatScalar *v; 401 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 402 PetscBool usecprow = a->compressedrow.use; 403 404 PetscFunctionBegin; 405 PetscCall(VecGetArrayRead(xx, &x)); 406 PetscCall(VecGetArrayWrite(zz, &zarray)); 407 408 idx = a->j; 409 v = a->a; 410 if (usecprow) { 411 mbs = a->compressedrow.nrows; 412 ii = a->compressedrow.i; 413 ridx = a->compressedrow.rindex; 414 PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 415 } else { 416 mbs = a->mbs; 417 ii = a->i; 418 z = zarray; 419 } 420 421 for (i = 0; i < mbs; i++) { 422 n = ii[1] - ii[0]; 423 ii++; 424 sum1 = 0.0; 425 sum2 = 0.0; 426 sum3 = 0.0; 427 sum4 = 0.0; 428 429 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 430 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 431 for (j = 0; j < n; j++) { 432 xb = x + 4 * (*idx++); 433 x1 = xb[0]; 434 x2 = xb[1]; 435 x3 = xb[2]; 436 x4 = xb[3]; 437 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 438 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 439 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 440 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 441 v += 16; 442 } 443 if (usecprow) z = zarray + 4 * ridx[i]; 444 z[0] = sum1; 445 z[1] = sum2; 446 z[2] = sum3; 447 z[3] = sum4; 448 if (!usecprow) z += 4; 449 } 450 PetscCall(VecRestoreArrayRead(xx, &x)); 451 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 452 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 453 PetscFunctionReturn(PETSC_SUCCESS); 454 } 455 456 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 457 { 458 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 459 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 460 const PetscScalar *xb, *x; 461 const MatScalar *v; 462 const PetscInt *idx, *ii, *ridx = NULL; 463 PetscInt mbs, i, j, n; 464 PetscBool usecprow = a->compressedrow.use; 465 466 PetscFunctionBegin; 467 PetscCall(VecGetArrayRead(xx, &x)); 468 PetscCall(VecGetArrayWrite(zz, &zarray)); 469 470 idx = a->j; 471 v = a->a; 472 if (usecprow) { 473 mbs = a->compressedrow.nrows; 474 ii = a->compressedrow.i; 475 ridx = a->compressedrow.rindex; 476 PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 477 } else { 478 mbs = a->mbs; 479 ii = a->i; 480 z = zarray; 481 } 482 483 for (i = 0; i < mbs; i++) { 484 n = ii[1] - ii[0]; 485 ii++; 486 sum1 = 0.0; 487 sum2 = 0.0; 488 sum3 = 0.0; 489 sum4 = 0.0; 490 sum5 = 0.0; 491 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 492 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 493 for (j = 0; j < n; j++) { 494 xb = x + 5 * (*idx++); 495 x1 = xb[0]; 496 x2 = xb[1]; 497 x3 = xb[2]; 498 x4 = xb[3]; 499 x5 = xb[4]; 500 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 501 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 502 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 503 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 504 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 505 v += 25; 506 } 507 if (usecprow) z = zarray + 5 * ridx[i]; 508 z[0] = sum1; 509 z[1] = sum2; 510 z[2] = sum3; 511 z[3] = sum4; 512 z[4] = sum5; 513 if (!usecprow) z += 5; 514 } 515 PetscCall(VecRestoreArrayRead(xx, &x)); 516 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 517 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 518 PetscFunctionReturn(PETSC_SUCCESS); 519 } 520 521 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 522 { 523 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 524 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 525 const PetscScalar *x, *xb; 526 PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 527 const MatScalar *v; 528 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 529 PetscBool usecprow = a->compressedrow.use; 530 531 PetscFunctionBegin; 532 PetscCall(VecGetArrayRead(xx, &x)); 533 PetscCall(VecGetArrayWrite(zz, &zarray)); 534 535 idx = a->j; 536 v = a->a; 537 if (usecprow) { 538 mbs = a->compressedrow.nrows; 539 ii = a->compressedrow.i; 540 ridx = a->compressedrow.rindex; 541 PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 542 } else { 543 mbs = a->mbs; 544 ii = a->i; 545 z = zarray; 546 } 547 548 for (i = 0; i < mbs; i++) { 549 n = ii[1] - ii[0]; 550 ii++; 551 sum1 = 0.0; 552 sum2 = 0.0; 553 sum3 = 0.0; 554 sum4 = 0.0; 555 sum5 = 0.0; 556 sum6 = 0.0; 557 558 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 559 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 560 for (j = 0; j < n; j++) { 561 xb = x + 6 * (*idx++); 562 x1 = xb[0]; 563 x2 = xb[1]; 564 x3 = xb[2]; 565 x4 = xb[3]; 566 x5 = xb[4]; 567 x6 = xb[5]; 568 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 569 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 570 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 571 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 572 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 573 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 574 v += 36; 575 } 576 if (usecprow) z = zarray + 6 * ridx[i]; 577 z[0] = sum1; 578 z[1] = sum2; 579 z[2] = sum3; 580 z[3] = sum4; 581 z[4] = sum5; 582 z[5] = sum6; 583 if (!usecprow) z += 6; 584 } 585 586 PetscCall(VecRestoreArrayRead(xx, &x)); 587 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 588 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 589 PetscFunctionReturn(PETSC_SUCCESS); 590 } 591 592 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 593 { 594 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 595 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 596 const PetscScalar *x, *xb; 597 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 598 const MatScalar *v; 599 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 600 PetscBool usecprow = a->compressedrow.use; 601 602 PetscFunctionBegin; 603 PetscCall(VecGetArrayRead(xx, &x)); 604 PetscCall(VecGetArrayWrite(zz, &zarray)); 605 606 idx = a->j; 607 v = a->a; 608 if (usecprow) { 609 mbs = a->compressedrow.nrows; 610 ii = a->compressedrow.i; 611 ridx = a->compressedrow.rindex; 612 PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 613 } else { 614 mbs = a->mbs; 615 ii = a->i; 616 z = zarray; 617 } 618 619 for (i = 0; i < mbs; i++) { 620 n = ii[1] - ii[0]; 621 ii++; 622 sum1 = 0.0; 623 sum2 = 0.0; 624 sum3 = 0.0; 625 sum4 = 0.0; 626 sum5 = 0.0; 627 sum6 = 0.0; 628 sum7 = 0.0; 629 630 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 631 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 632 for (j = 0; j < n; j++) { 633 xb = x + 7 * (*idx++); 634 x1 = xb[0]; 635 x2 = xb[1]; 636 x3 = xb[2]; 637 x4 = xb[3]; 638 x5 = xb[4]; 639 x6 = xb[5]; 640 x7 = xb[6]; 641 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 642 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 643 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 644 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 645 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 646 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 647 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 648 v += 49; 649 } 650 if (usecprow) z = zarray + 7 * ridx[i]; 651 z[0] = sum1; 652 z[1] = sum2; 653 z[2] = sum3; 654 z[3] = sum4; 655 z[4] = sum5; 656 z[5] = sum6; 657 z[6] = sum7; 658 if (!usecprow) z += 7; 659 } 660 661 PetscCall(VecRestoreArrayRead(xx, &x)); 662 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 663 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 664 PetscFunctionReturn(PETSC_SUCCESS); 665 } 666 667 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 668 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 669 { 670 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 671 PetscScalar *z = NULL, *work, *workt, *zarray; 672 const PetscScalar *x, *xb; 673 const MatScalar *v; 674 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 675 const PetscInt *idx, *ii, *ridx = NULL; 676 PetscInt k; 677 PetscBool usecprow = a->compressedrow.use; 678 679 __m256d a0, a1, a2, a3, a4, a5; 680 __m256d w0, w1, w2, w3; 681 __m256d z0, z1, z2; 682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 683 684 PetscFunctionBegin; 685 PetscCall(VecGetArrayRead(xx, &x)); 686 PetscCall(VecGetArrayWrite(zz, &zarray)); 687 688 idx = a->j; 689 v = a->a; 690 if (usecprow) { 691 mbs = a->compressedrow.nrows; 692 ii = a->compressedrow.i; 693 ridx = a->compressedrow.rindex; 694 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 if (!a->mult_work) { 702 k = PetscMax(A->rmap->n, A->cmap->n); 703 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 704 } 705 706 work = a->mult_work; 707 for (i = 0; i < mbs; i++) { 708 n = ii[1] - ii[0]; 709 ii++; 710 workt = work; 711 for (j = 0; j < n; j++) { 712 xb = x + bs * (*idx++); 713 for (k = 0; k < bs; k++) workt[k] = xb[k]; 714 workt += bs; 715 } 716 if (usecprow) z = zarray + bs * ridx[i]; 717 718 z0 = _mm256_setzero_pd(); 719 z1 = _mm256_setzero_pd(); 720 z2 = _mm256_setzero_pd(); 721 722 for (j = 0; j < n; j++) { 723 /* first column of a */ 724 w0 = _mm256_set1_pd(work[j * 9]); 725 a0 = _mm256_loadu_pd(&v[j * 81]); 726 z0 = _mm256_fmadd_pd(a0, w0, z0); 727 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 728 z1 = _mm256_fmadd_pd(a1, w0, z1); 729 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 730 z2 = _mm256_fmadd_pd(a2, w0, z2); 731 732 /* second column of a */ 733 w1 = _mm256_set1_pd(work[j * 9 + 1]); 734 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 735 z0 = _mm256_fmadd_pd(a0, w1, z0); 736 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 737 z1 = _mm256_fmadd_pd(a1, w1, z1); 738 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 739 z2 = _mm256_fmadd_pd(a2, w1, z2); 740 741 /* third column of a */ 742 w2 = _mm256_set1_pd(work[j * 9 + 2]); 743 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 744 z0 = _mm256_fmadd_pd(a3, w2, z0); 745 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 746 z1 = _mm256_fmadd_pd(a4, w2, z1); 747 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 748 z2 = _mm256_fmadd_pd(a5, w2, z2); 749 750 /* fourth column of a */ 751 w3 = _mm256_set1_pd(work[j * 9 + 3]); 752 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 753 z0 = _mm256_fmadd_pd(a0, w3, z0); 754 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 755 z1 = _mm256_fmadd_pd(a1, w3, z1); 756 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 757 z2 = _mm256_fmadd_pd(a2, w3, z2); 758 759 /* fifth column of a */ 760 w0 = _mm256_set1_pd(work[j * 9 + 4]); 761 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 762 z0 = _mm256_fmadd_pd(a3, w0, z0); 763 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 764 z1 = _mm256_fmadd_pd(a4, w0, z1); 765 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 766 z2 = _mm256_fmadd_pd(a5, w0, z2); 767 768 /* sixth column of a */ 769 w1 = _mm256_set1_pd(work[j * 9 + 5]); 770 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 771 z0 = _mm256_fmadd_pd(a0, w1, z0); 772 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 773 z1 = _mm256_fmadd_pd(a1, w1, z1); 774 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 775 z2 = _mm256_fmadd_pd(a2, w1, z2); 776 777 /* seventh column of a */ 778 w2 = _mm256_set1_pd(work[j * 9 + 6]); 779 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 780 z0 = _mm256_fmadd_pd(a0, w2, z0); 781 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 782 z1 = _mm256_fmadd_pd(a1, w2, z1); 783 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 784 z2 = _mm256_fmadd_pd(a2, w2, z2); 785 786 /* eighth column of a */ 787 w3 = _mm256_set1_pd(work[j * 9 + 7]); 788 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 789 z0 = _mm256_fmadd_pd(a3, w3, z0); 790 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 791 z1 = _mm256_fmadd_pd(a4, w3, z1); 792 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 793 z2 = _mm256_fmadd_pd(a5, w3, z2); 794 795 /* ninth column of a */ 796 w0 = _mm256_set1_pd(work[j * 9 + 8]); 797 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 798 z0 = _mm256_fmadd_pd(a0, w0, z0); 799 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 800 z1 = _mm256_fmadd_pd(a1, w0, z1); 801 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 802 z2 = _mm256_fmadd_pd(a2, w0, z2); 803 } 804 805 _mm256_storeu_pd(&z[0], z0); 806 _mm256_storeu_pd(&z[4], z1); 807 _mm256_maskstore_pd(&z[8], mask1, z2); 808 809 v += n * bs2; 810 if (!usecprow) z += bs; 811 } 812 PetscCall(VecRestoreArrayRead(xx, &x)); 813 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 814 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 815 PetscFunctionReturn(PETSC_SUCCESS); 816 } 817 #endif 818 819 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 820 { 821 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 822 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 823 const PetscScalar *x, *xb; 824 PetscScalar *zarray, xv; 825 const MatScalar *v; 826 const PetscInt *ii, *ij = a->j, *idx; 827 PetscInt mbs, i, j, k, n, *ridx = NULL; 828 PetscBool usecprow = a->compressedrow.use; 829 830 PetscFunctionBegin; 831 PetscCall(VecGetArrayRead(xx, &x)); 832 PetscCall(VecGetArrayWrite(zz, &zarray)); 833 834 v = a->a; 835 if (usecprow) { 836 mbs = a->compressedrow.nrows; 837 ii = a->compressedrow.i; 838 ridx = a->compressedrow.rindex; 839 PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 840 } else { 841 mbs = a->mbs; 842 ii = a->i; 843 z = zarray; 844 } 845 846 for (i = 0; i < mbs; i++) { 847 n = ii[i + 1] - ii[i]; 848 idx = ij + ii[i]; 849 sum1 = 0.0; 850 sum2 = 0.0; 851 sum3 = 0.0; 852 sum4 = 0.0; 853 sum5 = 0.0; 854 sum6 = 0.0; 855 sum7 = 0.0; 856 sum8 = 0.0; 857 sum9 = 0.0; 858 sum10 = 0.0; 859 sum11 = 0.0; 860 861 for (j = 0; j < n; j++) { 862 xb = x + 11 * (idx[j]); 863 864 for (k = 0; k < 11; k++) { 865 xv = xb[k]; 866 sum1 += v[0] * xv; 867 sum2 += v[1] * xv; 868 sum3 += v[2] * xv; 869 sum4 += v[3] * xv; 870 sum5 += v[4] * xv; 871 sum6 += v[5] * xv; 872 sum7 += v[6] * xv; 873 sum8 += v[7] * xv; 874 sum9 += v[8] * xv; 875 sum10 += v[9] * xv; 876 sum11 += v[10] * xv; 877 v += 11; 878 } 879 } 880 if (usecprow) z = zarray + 11 * ridx[i]; 881 z[0] = sum1; 882 z[1] = sum2; 883 z[2] = sum3; 884 z[3] = sum4; 885 z[4] = sum5; 886 z[5] = sum6; 887 z[6] = sum7; 888 z[7] = sum8; 889 z[8] = sum9; 890 z[9] = sum10; 891 z[10] = sum11; 892 893 if (!usecprow) z += 11; 894 } 895 896 PetscCall(VecRestoreArrayRead(xx, &x)); 897 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 898 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 899 PetscFunctionReturn(PETSC_SUCCESS); 900 } 901 902 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 903 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 904 { 905 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 906 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 907 const PetscScalar *x, *xb; 908 PetscScalar *zarray, xv; 909 const MatScalar *v; 910 const PetscInt *ii, *ij = a->j, *idx; 911 PetscInt mbs, i, j, k, n, *ridx = NULL; 912 PetscBool usecprow = a->compressedrow.use; 913 914 PetscFunctionBegin; 915 PetscCall(VecGetArrayRead(xx, &x)); 916 PetscCall(VecGetArrayWrite(zz, &zarray)); 917 918 v = a->a; 919 if (usecprow) { 920 mbs = a->compressedrow.nrows; 921 ii = a->compressedrow.i; 922 ridx = a->compressedrow.rindex; 923 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 924 } else { 925 mbs = a->mbs; 926 ii = a->i; 927 z = zarray; 928 } 929 930 for (i = 0; i < mbs; i++) { 931 n = ii[i + 1] - ii[i]; 932 idx = ij + ii[i]; 933 sum1 = 0.0; 934 sum2 = 0.0; 935 sum3 = 0.0; 936 sum4 = 0.0; 937 sum5 = 0.0; 938 sum6 = 0.0; 939 sum7 = 0.0; 940 sum8 = 0.0; 941 sum9 = 0.0; 942 sum10 = 0.0; 943 sum11 = 0.0; 944 sum12 = 0.0; 945 946 for (j = 0; j < n; j++) { 947 xb = x + 12 * (idx[j]); 948 949 for (k = 0; k < 12; k++) { 950 xv = xb[k]; 951 sum1 += v[0] * xv; 952 sum2 += v[1] * xv; 953 sum3 += v[2] * xv; 954 sum4 += v[3] * xv; 955 sum5 += v[4] * xv; 956 sum6 += v[5] * xv; 957 sum7 += v[6] * xv; 958 sum8 += v[7] * xv; 959 sum9 += v[8] * xv; 960 sum10 += v[9] * xv; 961 sum11 += v[10] * xv; 962 sum12 += v[11] * xv; 963 v += 12; 964 } 965 } 966 if (usecprow) z = zarray + 12 * ridx[i]; 967 z[0] = sum1; 968 z[1] = sum2; 969 z[2] = sum3; 970 z[3] = sum4; 971 z[4] = sum5; 972 z[5] = sum6; 973 z[6] = sum7; 974 z[7] = sum8; 975 z[8] = sum9; 976 z[9] = sum10; 977 z[10] = sum11; 978 z[11] = sum12; 979 if (!usecprow) z += 12; 980 } 981 PetscCall(VecRestoreArrayRead(xx, &x)); 982 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 983 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 988 { 989 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 990 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 991 const PetscScalar *x, *xb; 992 PetscScalar *zarray, *yarray, xv; 993 const MatScalar *v; 994 const PetscInt *ii, *ij = a->j, *idx; 995 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 996 PetscBool usecprow = a->compressedrow.use; 997 998 PetscFunctionBegin; 999 PetscCall(VecGetArrayRead(xx, &x)); 1000 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1001 1002 v = a->a; 1003 if (usecprow) { 1004 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1005 mbs = a->compressedrow.nrows; 1006 ii = a->compressedrow.i; 1007 ridx = a->compressedrow.rindex; 1008 } else { 1009 ii = a->i; 1010 y = yarray; 1011 z = zarray; 1012 } 1013 1014 for (i = 0; i < mbs; i++) { 1015 n = ii[i + 1] - ii[i]; 1016 idx = ij + ii[i]; 1017 1018 if (usecprow) { 1019 y = yarray + 12 * ridx[i]; 1020 z = zarray + 12 * ridx[i]; 1021 } 1022 sum1 = y[0]; 1023 sum2 = y[1]; 1024 sum3 = y[2]; 1025 sum4 = y[3]; 1026 sum5 = y[4]; 1027 sum6 = y[5]; 1028 sum7 = y[6]; 1029 sum8 = y[7]; 1030 sum9 = y[8]; 1031 sum10 = y[9]; 1032 sum11 = y[10]; 1033 sum12 = y[11]; 1034 1035 for (j = 0; j < n; j++) { 1036 xb = x + 12 * (idx[j]); 1037 1038 for (k = 0; k < 12; k++) { 1039 xv = xb[k]; 1040 sum1 += v[0] * xv; 1041 sum2 += v[1] * xv; 1042 sum3 += v[2] * xv; 1043 sum4 += v[3] * xv; 1044 sum5 += v[4] * xv; 1045 sum6 += v[5] * xv; 1046 sum7 += v[6] * xv; 1047 sum8 += v[7] * xv; 1048 sum9 += v[8] * xv; 1049 sum10 += v[9] * xv; 1050 sum11 += v[10] * xv; 1051 sum12 += v[11] * xv; 1052 v += 12; 1053 } 1054 } 1055 1056 z[0] = sum1; 1057 z[1] = sum2; 1058 z[2] = sum3; 1059 z[3] = sum4; 1060 z[4] = sum5; 1061 z[5] = sum6; 1062 z[6] = sum7; 1063 z[7] = sum8; 1064 z[8] = sum9; 1065 z[9] = sum10; 1066 z[10] = sum11; 1067 z[11] = sum12; 1068 if (!usecprow) { 1069 y += 12; 1070 z += 12; 1071 } 1072 } 1073 PetscCall(VecRestoreArrayRead(xx, &x)); 1074 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1075 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1076 PetscFunctionReturn(PETSC_SUCCESS); 1077 } 1078 1079 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1080 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1081 { 1082 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1083 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1084 const PetscScalar *x, *xb; 1085 PetscScalar x1, x2, x3, x4, *zarray; 1086 const MatScalar *v; 1087 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1088 PetscInt mbs, i, j, n; 1089 PetscBool usecprow = a->compressedrow.use; 1090 1091 PetscFunctionBegin; 1092 PetscCall(VecGetArrayRead(xx, &x)); 1093 PetscCall(VecGetArrayWrite(zz, &zarray)); 1094 1095 v = a->a; 1096 if (usecprow) { 1097 mbs = a->compressedrow.nrows; 1098 ii = a->compressedrow.i; 1099 ridx = a->compressedrow.rindex; 1100 PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 1101 } else { 1102 mbs = a->mbs; 1103 ii = a->i; 1104 z = zarray; 1105 } 1106 1107 for (i = 0; i < mbs; i++) { 1108 n = ii[i + 1] - ii[i]; 1109 idx = ij + ii[i]; 1110 1111 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1112 for (j = 0; j < n; j++) { 1113 xb = x + 12 * (idx[j]); 1114 x1 = xb[0]; 1115 x2 = xb[1]; 1116 x3 = xb[2]; 1117 x4 = xb[3]; 1118 1119 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1120 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1121 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1122 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1123 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1124 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1125 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1126 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1127 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1128 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1129 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1130 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1131 v += 48; 1132 1133 x1 = xb[4]; 1134 x2 = xb[5]; 1135 x3 = xb[6]; 1136 x4 = xb[7]; 1137 1138 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1139 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1140 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1141 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1142 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1143 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1144 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1145 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1146 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1147 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1148 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1149 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1150 v += 48; 1151 1152 x1 = xb[8]; 1153 x2 = xb[9]; 1154 x3 = xb[10]; 1155 x4 = xb[11]; 1156 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1157 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1158 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1159 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1160 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1161 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1162 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1163 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1164 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1165 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1166 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1167 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1168 v += 48; 1169 } 1170 if (usecprow) z = zarray + 12 * ridx[i]; 1171 z[0] = sum1; 1172 z[1] = sum2; 1173 z[2] = sum3; 1174 z[3] = sum4; 1175 z[4] = sum5; 1176 z[5] = sum6; 1177 z[6] = sum7; 1178 z[7] = sum8; 1179 z[8] = sum9; 1180 z[9] = sum10; 1181 z[10] = sum11; 1182 z[11] = sum12; 1183 if (!usecprow) z += 12; 1184 } 1185 PetscCall(VecRestoreArrayRead(xx, &x)); 1186 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1187 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1188 PetscFunctionReturn(PETSC_SUCCESS); 1189 } 1190 1191 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1192 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1193 { 1194 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1195 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 1196 const PetscScalar *x, *xb; 1197 PetscScalar x1, x2, x3, x4, *zarray, *yarray; 1198 const MatScalar *v; 1199 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 1200 PetscInt mbs = a->mbs, i, j, n; 1201 PetscBool usecprow = a->compressedrow.use; 1202 1203 PetscFunctionBegin; 1204 PetscCall(VecGetArrayRead(xx, &x)); 1205 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 1206 1207 v = a->a; 1208 if (usecprow) { 1209 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 1210 mbs = a->compressedrow.nrows; 1211 ii = a->compressedrow.i; 1212 ridx = a->compressedrow.rindex; 1213 } else { 1214 ii = a->i; 1215 y = yarray; 1216 z = zarray; 1217 } 1218 1219 for (i = 0; i < mbs; i++) { 1220 n = ii[i + 1] - ii[i]; 1221 idx = ij + ii[i]; 1222 1223 if (usecprow) { 1224 y = yarray + 12 * ridx[i]; 1225 z = zarray + 12 * ridx[i]; 1226 } 1227 sum1 = y[0]; 1228 sum2 = y[1]; 1229 sum3 = y[2]; 1230 sum4 = y[3]; 1231 sum5 = y[4]; 1232 sum6 = y[5]; 1233 sum7 = y[6]; 1234 sum8 = y[7]; 1235 sum9 = y[8]; 1236 sum10 = y[9]; 1237 sum11 = y[10]; 1238 sum12 = y[11]; 1239 1240 for (j = 0; j < n; j++) { 1241 xb = x + 12 * (idx[j]); 1242 x1 = xb[0]; 1243 x2 = xb[1]; 1244 x3 = xb[2]; 1245 x4 = xb[3]; 1246 1247 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1248 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1249 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1250 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1251 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1252 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1253 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1254 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1255 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1256 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1257 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1258 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1259 v += 48; 1260 1261 x1 = xb[4]; 1262 x2 = xb[5]; 1263 x3 = xb[6]; 1264 x4 = xb[7]; 1265 1266 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1267 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1268 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1269 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1270 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1271 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1272 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1273 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1274 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1275 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1276 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1277 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1278 v += 48; 1279 1280 x1 = xb[8]; 1281 x2 = xb[9]; 1282 x3 = xb[10]; 1283 x4 = xb[11]; 1284 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 1285 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 1286 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 1287 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 1288 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 1289 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 1290 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 1291 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 1292 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 1293 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 1294 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 1295 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 1296 v += 48; 1297 } 1298 z[0] = sum1; 1299 z[1] = sum2; 1300 z[2] = sum3; 1301 z[3] = sum4; 1302 z[4] = sum5; 1303 z[5] = sum6; 1304 z[6] = sum7; 1305 z[7] = sum8; 1306 z[8] = sum9; 1307 z[9] = sum10; 1308 z[10] = sum11; 1309 z[11] = sum12; 1310 if (!usecprow) { 1311 y += 12; 1312 z += 12; 1313 } 1314 } 1315 PetscCall(VecRestoreArrayRead(xx, &x)); 1316 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 1317 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1318 PetscFunctionReturn(PETSC_SUCCESS); 1319 } 1320 1321 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1322 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1323 { 1324 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1325 PetscScalar *z = NULL, *zarray; 1326 const PetscScalar *x, *work; 1327 const MatScalar *v = a->a; 1328 PetscInt mbs, i, j, n; 1329 const PetscInt *idx = a->j, *ii, *ridx = NULL; 1330 PetscBool usecprow = a->compressedrow.use; 1331 const PetscInt bs = 12, bs2 = 144; 1332 1333 __m256d a0, a1, a2, a3, a4, a5; 1334 __m256d w0, w1, w2, w3; 1335 __m256d z0, z1, z2; 1336 1337 PetscFunctionBegin; 1338 PetscCall(VecGetArrayRead(xx, &x)); 1339 PetscCall(VecGetArrayWrite(zz, &zarray)); 1340 1341 if (usecprow) { 1342 mbs = a->compressedrow.nrows; 1343 ii = a->compressedrow.i; 1344 ridx = a->compressedrow.rindex; 1345 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 1346 } else { 1347 mbs = a->mbs; 1348 ii = a->i; 1349 z = zarray; 1350 } 1351 1352 for (i = 0; i < mbs; i++) { 1353 z0 = _mm256_setzero_pd(); 1354 z1 = _mm256_setzero_pd(); 1355 z2 = _mm256_setzero_pd(); 1356 1357 n = ii[1] - ii[0]; 1358 ii++; 1359 for (j = 0; j < n; j++) { 1360 work = x + bs * (*idx++); 1361 1362 /* first column of a */ 1363 w0 = _mm256_set1_pd(work[0]); 1364 a0 = _mm256_loadu_pd(v + 0); 1365 z0 = _mm256_fmadd_pd(a0, w0, z0); 1366 a1 = _mm256_loadu_pd(v + 4); 1367 z1 = _mm256_fmadd_pd(a1, w0, z1); 1368 a2 = _mm256_loadu_pd(v + 8); 1369 z2 = _mm256_fmadd_pd(a2, w0, z2); 1370 1371 /* second column of a */ 1372 w1 = _mm256_set1_pd(work[1]); 1373 a3 = _mm256_loadu_pd(v + 12); 1374 z0 = _mm256_fmadd_pd(a3, w1, z0); 1375 a4 = _mm256_loadu_pd(v + 16); 1376 z1 = _mm256_fmadd_pd(a4, w1, z1); 1377 a5 = _mm256_loadu_pd(v + 20); 1378 z2 = _mm256_fmadd_pd(a5, w1, z2); 1379 1380 /* third column of a */ 1381 w2 = _mm256_set1_pd(work[2]); 1382 a0 = _mm256_loadu_pd(v + 24); 1383 z0 = _mm256_fmadd_pd(a0, w2, z0); 1384 a1 = _mm256_loadu_pd(v + 28); 1385 z1 = _mm256_fmadd_pd(a1, w2, z1); 1386 a2 = _mm256_loadu_pd(v + 32); 1387 z2 = _mm256_fmadd_pd(a2, w2, z2); 1388 1389 /* fourth column of a */ 1390 w3 = _mm256_set1_pd(work[3]); 1391 a3 = _mm256_loadu_pd(v + 36); 1392 z0 = _mm256_fmadd_pd(a3, w3, z0); 1393 a4 = _mm256_loadu_pd(v + 40); 1394 z1 = _mm256_fmadd_pd(a4, w3, z1); 1395 a5 = _mm256_loadu_pd(v + 44); 1396 z2 = _mm256_fmadd_pd(a5, w3, z2); 1397 1398 /* fifth column of a */ 1399 w0 = _mm256_set1_pd(work[4]); 1400 a0 = _mm256_loadu_pd(v + 48); 1401 z0 = _mm256_fmadd_pd(a0, w0, z0); 1402 a1 = _mm256_loadu_pd(v + 52); 1403 z1 = _mm256_fmadd_pd(a1, w0, z1); 1404 a2 = _mm256_loadu_pd(v + 56); 1405 z2 = _mm256_fmadd_pd(a2, w0, z2); 1406 1407 /* sixth column of a */ 1408 w1 = _mm256_set1_pd(work[5]); 1409 a3 = _mm256_loadu_pd(v + 60); 1410 z0 = _mm256_fmadd_pd(a3, w1, z0); 1411 a4 = _mm256_loadu_pd(v + 64); 1412 z1 = _mm256_fmadd_pd(a4, w1, z1); 1413 a5 = _mm256_loadu_pd(v + 68); 1414 z2 = _mm256_fmadd_pd(a5, w1, z2); 1415 1416 /* seventh column of a */ 1417 w2 = _mm256_set1_pd(work[6]); 1418 a0 = _mm256_loadu_pd(v + 72); 1419 z0 = _mm256_fmadd_pd(a0, w2, z0); 1420 a1 = _mm256_loadu_pd(v + 76); 1421 z1 = _mm256_fmadd_pd(a1, w2, z1); 1422 a2 = _mm256_loadu_pd(v + 80); 1423 z2 = _mm256_fmadd_pd(a2, w2, z2); 1424 1425 /* eighth column of a */ 1426 w3 = _mm256_set1_pd(work[7]); 1427 a3 = _mm256_loadu_pd(v + 84); 1428 z0 = _mm256_fmadd_pd(a3, w3, z0); 1429 a4 = _mm256_loadu_pd(v + 88); 1430 z1 = _mm256_fmadd_pd(a4, w3, z1); 1431 a5 = _mm256_loadu_pd(v + 92); 1432 z2 = _mm256_fmadd_pd(a5, w3, z2); 1433 1434 /* ninth column of a */ 1435 w0 = _mm256_set1_pd(work[8]); 1436 a0 = _mm256_loadu_pd(v + 96); 1437 z0 = _mm256_fmadd_pd(a0, w0, z0); 1438 a1 = _mm256_loadu_pd(v + 100); 1439 z1 = _mm256_fmadd_pd(a1, w0, z1); 1440 a2 = _mm256_loadu_pd(v + 104); 1441 z2 = _mm256_fmadd_pd(a2, w0, z2); 1442 1443 /* tenth column of a */ 1444 w1 = _mm256_set1_pd(work[9]); 1445 a3 = _mm256_loadu_pd(v + 108); 1446 z0 = _mm256_fmadd_pd(a3, w1, z0); 1447 a4 = _mm256_loadu_pd(v + 112); 1448 z1 = _mm256_fmadd_pd(a4, w1, z1); 1449 a5 = _mm256_loadu_pd(v + 116); 1450 z2 = _mm256_fmadd_pd(a5, w1, z2); 1451 1452 /* eleventh column of a */ 1453 w2 = _mm256_set1_pd(work[10]); 1454 a0 = _mm256_loadu_pd(v + 120); 1455 z0 = _mm256_fmadd_pd(a0, w2, z0); 1456 a1 = _mm256_loadu_pd(v + 124); 1457 z1 = _mm256_fmadd_pd(a1, w2, z1); 1458 a2 = _mm256_loadu_pd(v + 128); 1459 z2 = _mm256_fmadd_pd(a2, w2, z2); 1460 1461 /* twelveth column of a */ 1462 w3 = _mm256_set1_pd(work[11]); 1463 a3 = _mm256_loadu_pd(v + 132); 1464 z0 = _mm256_fmadd_pd(a3, w3, z0); 1465 a4 = _mm256_loadu_pd(v + 136); 1466 z1 = _mm256_fmadd_pd(a4, w3, z1); 1467 a5 = _mm256_loadu_pd(v + 140); 1468 z2 = _mm256_fmadd_pd(a5, w3, z2); 1469 1470 v += bs2; 1471 } 1472 if (usecprow) z = zarray + bs * ridx[i]; 1473 _mm256_storeu_pd(&z[0], z0); 1474 _mm256_storeu_pd(&z[4], z1); 1475 _mm256_storeu_pd(&z[8], z2); 1476 if (!usecprow) z += bs; 1477 } 1478 PetscCall(VecRestoreArrayRead(xx, &x)); 1479 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1480 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1481 PetscFunctionReturn(PETSC_SUCCESS); 1482 } 1483 #endif 1484 1485 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1486 /* Default MatMult for block size 15 */ 1487 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1488 { 1489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1490 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1491 const PetscScalar *x, *xb; 1492 PetscScalar *zarray, xv; 1493 const MatScalar *v; 1494 const PetscInt *ii, *ij = a->j, *idx; 1495 PetscInt mbs, i, j, k, n, *ridx = NULL; 1496 PetscBool usecprow = a->compressedrow.use; 1497 1498 PetscFunctionBegin; 1499 PetscCall(VecGetArrayRead(xx, &x)); 1500 PetscCall(VecGetArrayWrite(zz, &zarray)); 1501 1502 v = a->a; 1503 if (usecprow) { 1504 mbs = a->compressedrow.nrows; 1505 ii = a->compressedrow.i; 1506 ridx = a->compressedrow.rindex; 1507 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1508 } else { 1509 mbs = a->mbs; 1510 ii = a->i; 1511 z = zarray; 1512 } 1513 1514 for (i = 0; i < mbs; i++) { 1515 n = ii[i + 1] - ii[i]; 1516 idx = ij + ii[i]; 1517 sum1 = 0.0; 1518 sum2 = 0.0; 1519 sum3 = 0.0; 1520 sum4 = 0.0; 1521 sum5 = 0.0; 1522 sum6 = 0.0; 1523 sum7 = 0.0; 1524 sum8 = 0.0; 1525 sum9 = 0.0; 1526 sum10 = 0.0; 1527 sum11 = 0.0; 1528 sum12 = 0.0; 1529 sum13 = 0.0; 1530 sum14 = 0.0; 1531 sum15 = 0.0; 1532 1533 for (j = 0; j < n; j++) { 1534 xb = x + 15 * (idx[j]); 1535 1536 for (k = 0; k < 15; k++) { 1537 xv = xb[k]; 1538 sum1 += v[0] * xv; 1539 sum2 += v[1] * xv; 1540 sum3 += v[2] * xv; 1541 sum4 += v[3] * xv; 1542 sum5 += v[4] * xv; 1543 sum6 += v[5] * xv; 1544 sum7 += v[6] * xv; 1545 sum8 += v[7] * xv; 1546 sum9 += v[8] * xv; 1547 sum10 += v[9] * xv; 1548 sum11 += v[10] * xv; 1549 sum12 += v[11] * xv; 1550 sum13 += v[12] * xv; 1551 sum14 += v[13] * xv; 1552 sum15 += v[14] * xv; 1553 v += 15; 1554 } 1555 } 1556 if (usecprow) z = zarray + 15 * ridx[i]; 1557 z[0] = sum1; 1558 z[1] = sum2; 1559 z[2] = sum3; 1560 z[3] = sum4; 1561 z[4] = sum5; 1562 z[5] = sum6; 1563 z[6] = sum7; 1564 z[7] = sum8; 1565 z[8] = sum9; 1566 z[9] = sum10; 1567 z[10] = sum11; 1568 z[11] = sum12; 1569 z[12] = sum13; 1570 z[13] = sum14; 1571 z[14] = sum15; 1572 1573 if (!usecprow) z += 15; 1574 } 1575 1576 PetscCall(VecRestoreArrayRead(xx, &x)); 1577 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1578 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1579 PetscFunctionReturn(PETSC_SUCCESS); 1580 } 1581 1582 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1583 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1584 { 1585 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1586 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1587 const PetscScalar *x, *xb; 1588 PetscScalar x1, x2, x3, x4, *zarray; 1589 const MatScalar *v; 1590 const PetscInt *ii, *ij = a->j, *idx; 1591 PetscInt mbs, i, j, n, *ridx = NULL; 1592 PetscBool usecprow = a->compressedrow.use; 1593 1594 PetscFunctionBegin; 1595 PetscCall(VecGetArrayRead(xx, &x)); 1596 PetscCall(VecGetArrayWrite(zz, &zarray)); 1597 1598 v = a->a; 1599 if (usecprow) { 1600 mbs = a->compressedrow.nrows; 1601 ii = a->compressedrow.i; 1602 ridx = a->compressedrow.rindex; 1603 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1604 } else { 1605 mbs = a->mbs; 1606 ii = a->i; 1607 z = zarray; 1608 } 1609 1610 for (i = 0; i < mbs; i++) { 1611 n = ii[i + 1] - ii[i]; 1612 idx = ij + ii[i]; 1613 sum1 = 0.0; 1614 sum2 = 0.0; 1615 sum3 = 0.0; 1616 sum4 = 0.0; 1617 sum5 = 0.0; 1618 sum6 = 0.0; 1619 sum7 = 0.0; 1620 sum8 = 0.0; 1621 sum9 = 0.0; 1622 sum10 = 0.0; 1623 sum11 = 0.0; 1624 sum12 = 0.0; 1625 sum13 = 0.0; 1626 sum14 = 0.0; 1627 sum15 = 0.0; 1628 1629 for (j = 0; j < n; j++) { 1630 xb = x + 15 * (idx[j]); 1631 x1 = xb[0]; 1632 x2 = xb[1]; 1633 x3 = xb[2]; 1634 x4 = xb[3]; 1635 1636 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1637 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1638 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1639 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1640 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1641 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1642 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1643 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1644 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1645 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1646 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1647 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1648 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1649 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1650 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1651 1652 v += 60; 1653 1654 x1 = xb[4]; 1655 x2 = xb[5]; 1656 x3 = xb[6]; 1657 x4 = xb[7]; 1658 1659 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1660 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1661 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1662 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1663 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1664 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1665 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1666 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1667 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1668 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1669 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1670 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1671 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1672 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1673 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1674 v += 60; 1675 1676 x1 = xb[8]; 1677 x2 = xb[9]; 1678 x3 = xb[10]; 1679 x4 = xb[11]; 1680 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 1681 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 1682 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 1683 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 1684 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 1685 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 1686 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 1687 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 1688 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 1689 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 1690 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 1691 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 1692 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 1693 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 1694 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 1695 v += 60; 1696 1697 x1 = xb[12]; 1698 x2 = xb[13]; 1699 x3 = xb[14]; 1700 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 1701 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 1702 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 1703 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 1704 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 1705 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 1706 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 1707 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 1708 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 1709 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 1710 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 1711 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 1712 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 1713 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 1714 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 1715 v += 45; 1716 } 1717 if (usecprow) z = zarray + 15 * ridx[i]; 1718 z[0] = sum1; 1719 z[1] = sum2; 1720 z[2] = sum3; 1721 z[3] = sum4; 1722 z[4] = sum5; 1723 z[5] = sum6; 1724 z[6] = sum7; 1725 z[7] = sum8; 1726 z[8] = sum9; 1727 z[9] = sum10; 1728 z[10] = sum11; 1729 z[11] = sum12; 1730 z[12] = sum13; 1731 z[13] = sum14; 1732 z[14] = sum15; 1733 1734 if (!usecprow) z += 15; 1735 } 1736 1737 PetscCall(VecRestoreArrayRead(xx, &x)); 1738 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1739 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1740 PetscFunctionReturn(PETSC_SUCCESS); 1741 } 1742 1743 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1744 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1745 { 1746 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1747 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1748 const PetscScalar *x, *xb; 1749 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 1750 const MatScalar *v; 1751 const PetscInt *ii, *ij = a->j, *idx; 1752 PetscInt mbs, i, j, n, *ridx = NULL; 1753 PetscBool usecprow = a->compressedrow.use; 1754 1755 PetscFunctionBegin; 1756 PetscCall(VecGetArrayRead(xx, &x)); 1757 PetscCall(VecGetArrayWrite(zz, &zarray)); 1758 1759 v = a->a; 1760 if (usecprow) { 1761 mbs = a->compressedrow.nrows; 1762 ii = a->compressedrow.i; 1763 ridx = a->compressedrow.rindex; 1764 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1765 } else { 1766 mbs = a->mbs; 1767 ii = a->i; 1768 z = zarray; 1769 } 1770 1771 for (i = 0; i < mbs; i++) { 1772 n = ii[i + 1] - ii[i]; 1773 idx = ij + ii[i]; 1774 sum1 = 0.0; 1775 sum2 = 0.0; 1776 sum3 = 0.0; 1777 sum4 = 0.0; 1778 sum5 = 0.0; 1779 sum6 = 0.0; 1780 sum7 = 0.0; 1781 sum8 = 0.0; 1782 sum9 = 0.0; 1783 sum10 = 0.0; 1784 sum11 = 0.0; 1785 sum12 = 0.0; 1786 sum13 = 0.0; 1787 sum14 = 0.0; 1788 sum15 = 0.0; 1789 1790 for (j = 0; j < n; j++) { 1791 xb = x + 15 * (idx[j]); 1792 x1 = xb[0]; 1793 x2 = xb[1]; 1794 x3 = xb[2]; 1795 x4 = xb[3]; 1796 x5 = xb[4]; 1797 x6 = xb[5]; 1798 x7 = xb[6]; 1799 x8 = xb[7]; 1800 1801 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 1802 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 1803 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 1804 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 1805 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 1806 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 1807 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 1808 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 1809 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 1810 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 1811 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 1812 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 1813 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 1814 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 1815 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 1816 v += 120; 1817 1818 x1 = xb[8]; 1819 x2 = xb[9]; 1820 x3 = xb[10]; 1821 x4 = xb[11]; 1822 x5 = xb[12]; 1823 x6 = xb[13]; 1824 x7 = xb[14]; 1825 1826 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 1827 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 1828 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 1829 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 1830 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 1831 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 1832 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 1833 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 1834 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 1835 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 1836 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 1837 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 1838 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 1839 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 1840 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 1841 v += 105; 1842 } 1843 if (usecprow) z = zarray + 15 * ridx[i]; 1844 z[0] = sum1; 1845 z[1] = sum2; 1846 z[2] = sum3; 1847 z[3] = sum4; 1848 z[4] = sum5; 1849 z[5] = sum6; 1850 z[6] = sum7; 1851 z[7] = sum8; 1852 z[8] = sum9; 1853 z[9] = sum10; 1854 z[10] = sum11; 1855 z[11] = sum12; 1856 z[12] = sum13; 1857 z[13] = sum14; 1858 z[14] = sum15; 1859 1860 if (!usecprow) z += 15; 1861 } 1862 1863 PetscCall(VecRestoreArrayRead(xx, &x)); 1864 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1865 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1866 PetscFunctionReturn(PETSC_SUCCESS); 1867 } 1868 1869 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1870 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) 1871 { 1872 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1873 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 1874 const PetscScalar *x, *xb; 1875 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 1876 const MatScalar *v; 1877 const PetscInt *ii, *ij = a->j, *idx; 1878 PetscInt mbs, i, j, n, *ridx = NULL; 1879 PetscBool usecprow = a->compressedrow.use; 1880 1881 PetscFunctionBegin; 1882 PetscCall(VecGetArrayRead(xx, &x)); 1883 PetscCall(VecGetArrayWrite(zz, &zarray)); 1884 1885 v = a->a; 1886 if (usecprow) { 1887 mbs = a->compressedrow.nrows; 1888 ii = a->compressedrow.i; 1889 ridx = a->compressedrow.rindex; 1890 PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 1891 } else { 1892 mbs = a->mbs; 1893 ii = a->i; 1894 z = zarray; 1895 } 1896 1897 for (i = 0; i < mbs; i++) { 1898 n = ii[i + 1] - ii[i]; 1899 idx = ij + ii[i]; 1900 sum1 = 0.0; 1901 sum2 = 0.0; 1902 sum3 = 0.0; 1903 sum4 = 0.0; 1904 sum5 = 0.0; 1905 sum6 = 0.0; 1906 sum7 = 0.0; 1907 sum8 = 0.0; 1908 sum9 = 0.0; 1909 sum10 = 0.0; 1910 sum11 = 0.0; 1911 sum12 = 0.0; 1912 sum13 = 0.0; 1913 sum14 = 0.0; 1914 sum15 = 0.0; 1915 1916 for (j = 0; j < n; j++) { 1917 xb = x + 15 * (idx[j]); 1918 x1 = xb[0]; 1919 x2 = xb[1]; 1920 x3 = xb[2]; 1921 x4 = xb[3]; 1922 x5 = xb[4]; 1923 x6 = xb[5]; 1924 x7 = xb[6]; 1925 x8 = xb[7]; 1926 x9 = xb[8]; 1927 x10 = xb[9]; 1928 x11 = xb[10]; 1929 x12 = xb[11]; 1930 x13 = xb[12]; 1931 x14 = xb[13]; 1932 x15 = xb[14]; 1933 1934 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 1935 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 1936 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 1937 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 1938 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 1939 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 1940 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 1941 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 1942 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 1943 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 1944 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 1945 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 1946 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 1947 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 1948 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 1949 v += 225; 1950 } 1951 if (usecprow) z = zarray + 15 * ridx[i]; 1952 z[0] = sum1; 1953 z[1] = sum2; 1954 z[2] = sum3; 1955 z[3] = sum4; 1956 z[4] = sum5; 1957 z[5] = sum6; 1958 z[6] = sum7; 1959 z[7] = sum8; 1960 z[8] = sum9; 1961 z[9] = sum10; 1962 z[10] = sum11; 1963 z[11] = sum12; 1964 z[12] = sum13; 1965 z[13] = sum14; 1966 z[14] = sum15; 1967 1968 if (!usecprow) z += 15; 1969 } 1970 1971 PetscCall(VecRestoreArrayRead(xx, &x)); 1972 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 1973 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1974 PetscFunctionReturn(PETSC_SUCCESS); 1975 } 1976 1977 /* 1978 This will not work with MatScalar == float because it calls the BLAS 1979 */ 1980 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 1981 { 1982 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1983 PetscScalar *z = NULL, *work, *workt, *zarray; 1984 const PetscScalar *x, *xb; 1985 const MatScalar *v; 1986 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1987 const PetscInt *idx, *ii, *ridx = NULL; 1988 PetscInt ncols, k; 1989 PetscBool usecprow = a->compressedrow.use; 1990 1991 PetscFunctionBegin; 1992 PetscCall(VecGetArrayRead(xx, &x)); 1993 PetscCall(VecGetArrayWrite(zz, &zarray)); 1994 1995 idx = a->j; 1996 v = a->a; 1997 if (usecprow) { 1998 mbs = a->compressedrow.nrows; 1999 ii = a->compressedrow.i; 2000 ridx = a->compressedrow.rindex; 2001 PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 2002 } else { 2003 mbs = a->mbs; 2004 ii = a->i; 2005 z = zarray; 2006 } 2007 2008 if (!a->mult_work) { 2009 k = PetscMax(A->rmap->n, A->cmap->n); 2010 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2011 } 2012 work = a->mult_work; 2013 for (i = 0; i < mbs; i++) { 2014 n = ii[1] - ii[0]; 2015 ii++; 2016 ncols = n * bs; 2017 workt = work; 2018 for (j = 0; j < n; j++) { 2019 xb = x + bs * (*idx++); 2020 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2021 workt += bs; 2022 } 2023 if (usecprow) z = zarray + bs * ridx[i]; 2024 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 2025 v += n * bs2; 2026 if (!usecprow) z += bs; 2027 } 2028 PetscCall(VecRestoreArrayRead(xx, &x)); 2029 PetscCall(VecRestoreArrayWrite(zz, &zarray)); 2030 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2031 PetscFunctionReturn(PETSC_SUCCESS); 2032 } 2033 2034 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2035 { 2036 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2037 const PetscScalar *x; 2038 PetscScalar *y, *z, sum; 2039 const MatScalar *v; 2040 PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2041 const PetscInt *idx, *ii; 2042 PetscBool usecprow = a->compressedrow.use; 2043 2044 PetscFunctionBegin; 2045 PetscCall(VecGetArrayRead(xx, &x)); 2046 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 2047 2048 idx = a->j; 2049 v = a->a; 2050 if (usecprow) { 2051 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 2052 mbs = a->compressedrow.nrows; 2053 ii = a->compressedrow.i; 2054 ridx = a->compressedrow.rindex; 2055 } else { 2056 ii = a->i; 2057 } 2058 2059 for (i = 0; i < mbs; i++) { 2060 n = ii[1] - ii[0]; 2061 ii++; 2062 if (!usecprow) { 2063 sum = y[i]; 2064 } else { 2065 sum = y[ridx[i]]; 2066 } 2067 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2068 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2069 PetscSparseDensePlusDot(sum, x, v, idx, n); 2070 v += n; 2071 idx += n; 2072 if (usecprow) { 2073 z[ridx[i]] = sum; 2074 } else { 2075 z[i] = sum; 2076 } 2077 } 2078 PetscCall(VecRestoreArrayRead(xx, &x)); 2079 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 2080 PetscCall(PetscLogFlops(2.0 * a->nz)); 2081 PetscFunctionReturn(PETSC_SUCCESS); 2082 } 2083 2084 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2085 { 2086 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2087 PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2088 const PetscScalar *x, *xb; 2089 PetscScalar x1, x2, *yarray, *zarray; 2090 const MatScalar *v; 2091 PetscInt mbs = a->mbs, i, n, j; 2092 const PetscInt *idx, *ii, *ridx = NULL; 2093 PetscBool usecprow = a->compressedrow.use; 2094 2095 PetscFunctionBegin; 2096 PetscCall(VecGetArrayRead(xx, &x)); 2097 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2098 2099 idx = a->j; 2100 v = a->a; 2101 if (usecprow) { 2102 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 2103 mbs = a->compressedrow.nrows; 2104 ii = a->compressedrow.i; 2105 ridx = a->compressedrow.rindex; 2106 } else { 2107 ii = a->i; 2108 y = yarray; 2109 z = zarray; 2110 } 2111 2112 for (i = 0; i < mbs; i++) { 2113 n = ii[1] - ii[0]; 2114 ii++; 2115 if (usecprow) { 2116 z = zarray + 2 * ridx[i]; 2117 y = yarray + 2 * ridx[i]; 2118 } 2119 sum1 = y[0]; 2120 sum2 = y[1]; 2121 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2122 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2123 for (j = 0; j < n; j++) { 2124 xb = x + 2 * (*idx++); 2125 x1 = xb[0]; 2126 x2 = xb[1]; 2127 2128 sum1 += v[0] * x1 + v[2] * x2; 2129 sum2 += v[1] * x1 + v[3] * x2; 2130 v += 4; 2131 } 2132 z[0] = sum1; 2133 z[1] = sum2; 2134 if (!usecprow) { 2135 z += 2; 2136 y += 2; 2137 } 2138 } 2139 PetscCall(VecRestoreArrayRead(xx, &x)); 2140 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2141 PetscCall(PetscLogFlops(4.0 * a->nz)); 2142 PetscFunctionReturn(PETSC_SUCCESS); 2143 } 2144 2145 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2146 { 2147 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2148 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2149 const PetscScalar *x, *xb; 2150 const MatScalar *v; 2151 PetscInt mbs = a->mbs, i, j, n; 2152 const PetscInt *idx, *ii, *ridx = NULL; 2153 PetscBool usecprow = a->compressedrow.use; 2154 2155 PetscFunctionBegin; 2156 PetscCall(VecGetArrayRead(xx, &x)); 2157 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2158 2159 idx = a->j; 2160 v = a->a; 2161 if (usecprow) { 2162 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 2163 mbs = a->compressedrow.nrows; 2164 ii = a->compressedrow.i; 2165 ridx = a->compressedrow.rindex; 2166 } else { 2167 ii = a->i; 2168 y = yarray; 2169 z = zarray; 2170 } 2171 2172 for (i = 0; i < mbs; i++) { 2173 n = ii[1] - ii[0]; 2174 ii++; 2175 if (usecprow) { 2176 z = zarray + 3 * ridx[i]; 2177 y = yarray + 3 * ridx[i]; 2178 } 2179 sum1 = y[0]; 2180 sum2 = y[1]; 2181 sum3 = y[2]; 2182 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2183 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2184 for (j = 0; j < n; j++) { 2185 xb = x + 3 * (*idx++); 2186 x1 = xb[0]; 2187 x2 = xb[1]; 2188 x3 = xb[2]; 2189 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 2190 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 2191 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 2192 v += 9; 2193 } 2194 z[0] = sum1; 2195 z[1] = sum2; 2196 z[2] = sum3; 2197 if (!usecprow) { 2198 z += 3; 2199 y += 3; 2200 } 2201 } 2202 PetscCall(VecRestoreArrayRead(xx, &x)); 2203 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2204 PetscCall(PetscLogFlops(18.0 * a->nz)); 2205 PetscFunctionReturn(PETSC_SUCCESS); 2206 } 2207 2208 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2209 { 2210 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2211 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2212 const PetscScalar *x, *xb; 2213 const MatScalar *v; 2214 PetscInt mbs = a->mbs, i, j, n; 2215 const PetscInt *idx, *ii, *ridx = NULL; 2216 PetscBool usecprow = a->compressedrow.use; 2217 2218 PetscFunctionBegin; 2219 PetscCall(VecGetArrayRead(xx, &x)); 2220 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2221 2222 idx = a->j; 2223 v = a->a; 2224 if (usecprow) { 2225 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 2226 mbs = a->compressedrow.nrows; 2227 ii = a->compressedrow.i; 2228 ridx = a->compressedrow.rindex; 2229 } else { 2230 ii = a->i; 2231 y = yarray; 2232 z = zarray; 2233 } 2234 2235 for (i = 0; i < mbs; i++) { 2236 n = ii[1] - ii[0]; 2237 ii++; 2238 if (usecprow) { 2239 z = zarray + 4 * ridx[i]; 2240 y = yarray + 4 * ridx[i]; 2241 } 2242 sum1 = y[0]; 2243 sum2 = y[1]; 2244 sum3 = y[2]; 2245 sum4 = y[3]; 2246 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2247 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2248 for (j = 0; j < n; j++) { 2249 xb = x + 4 * (*idx++); 2250 x1 = xb[0]; 2251 x2 = xb[1]; 2252 x3 = xb[2]; 2253 x4 = xb[3]; 2254 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 2255 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 2256 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 2257 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 2258 v += 16; 2259 } 2260 z[0] = sum1; 2261 z[1] = sum2; 2262 z[2] = sum3; 2263 z[3] = sum4; 2264 if (!usecprow) { 2265 z += 4; 2266 y += 4; 2267 } 2268 } 2269 PetscCall(VecRestoreArrayRead(xx, &x)); 2270 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2271 PetscCall(PetscLogFlops(32.0 * a->nz)); 2272 PetscFunctionReturn(PETSC_SUCCESS); 2273 } 2274 2275 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2276 { 2277 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2278 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2279 const PetscScalar *x, *xb; 2280 PetscScalar *yarray, *zarray; 2281 const MatScalar *v; 2282 PetscInt mbs = a->mbs, i, j, n; 2283 const PetscInt *idx, *ii, *ridx = NULL; 2284 PetscBool usecprow = a->compressedrow.use; 2285 2286 PetscFunctionBegin; 2287 PetscCall(VecGetArrayRead(xx, &x)); 2288 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2289 2290 idx = a->j; 2291 v = a->a; 2292 if (usecprow) { 2293 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 2294 mbs = a->compressedrow.nrows; 2295 ii = a->compressedrow.i; 2296 ridx = a->compressedrow.rindex; 2297 } else { 2298 ii = a->i; 2299 y = yarray; 2300 z = zarray; 2301 } 2302 2303 for (i = 0; i < mbs; i++) { 2304 n = ii[1] - ii[0]; 2305 ii++; 2306 if (usecprow) { 2307 z = zarray + 5 * ridx[i]; 2308 y = yarray + 5 * ridx[i]; 2309 } 2310 sum1 = y[0]; 2311 sum2 = y[1]; 2312 sum3 = y[2]; 2313 sum4 = y[3]; 2314 sum5 = y[4]; 2315 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2316 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2317 for (j = 0; j < n; j++) { 2318 xb = x + 5 * (*idx++); 2319 x1 = xb[0]; 2320 x2 = xb[1]; 2321 x3 = xb[2]; 2322 x4 = xb[3]; 2323 x5 = xb[4]; 2324 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 2325 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 2326 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 2327 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 2328 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 2329 v += 25; 2330 } 2331 z[0] = sum1; 2332 z[1] = sum2; 2333 z[2] = sum3; 2334 z[3] = sum4; 2335 z[4] = sum5; 2336 if (!usecprow) { 2337 z += 5; 2338 y += 5; 2339 } 2340 } 2341 PetscCall(VecRestoreArrayRead(xx, &x)); 2342 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2343 PetscCall(PetscLogFlops(50.0 * a->nz)); 2344 PetscFunctionReturn(PETSC_SUCCESS); 2345 } 2346 2347 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2348 { 2349 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2350 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2351 const PetscScalar *x, *xb; 2352 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2353 const MatScalar *v; 2354 PetscInt mbs = a->mbs, i, j, n; 2355 const PetscInt *idx, *ii, *ridx = NULL; 2356 PetscBool usecprow = a->compressedrow.use; 2357 2358 PetscFunctionBegin; 2359 PetscCall(VecGetArrayRead(xx, &x)); 2360 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2361 2362 idx = a->j; 2363 v = a->a; 2364 if (usecprow) { 2365 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 2366 mbs = a->compressedrow.nrows; 2367 ii = a->compressedrow.i; 2368 ridx = a->compressedrow.rindex; 2369 } else { 2370 ii = a->i; 2371 y = yarray; 2372 z = zarray; 2373 } 2374 2375 for (i = 0; i < mbs; i++) { 2376 n = ii[1] - ii[0]; 2377 ii++; 2378 if (usecprow) { 2379 z = zarray + 6 * ridx[i]; 2380 y = yarray + 6 * ridx[i]; 2381 } 2382 sum1 = y[0]; 2383 sum2 = y[1]; 2384 sum3 = y[2]; 2385 sum4 = y[3]; 2386 sum5 = y[4]; 2387 sum6 = y[5]; 2388 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2389 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2390 for (j = 0; j < n; j++) { 2391 xb = x + 6 * (*idx++); 2392 x1 = xb[0]; 2393 x2 = xb[1]; 2394 x3 = xb[2]; 2395 x4 = xb[3]; 2396 x5 = xb[4]; 2397 x6 = xb[5]; 2398 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 2399 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 2400 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 2401 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 2402 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 2403 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 2404 v += 36; 2405 } 2406 z[0] = sum1; 2407 z[1] = sum2; 2408 z[2] = sum3; 2409 z[3] = sum4; 2410 z[4] = sum5; 2411 z[5] = sum6; 2412 if (!usecprow) { 2413 z += 6; 2414 y += 6; 2415 } 2416 } 2417 PetscCall(VecRestoreArrayRead(xx, &x)); 2418 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2419 PetscCall(PetscLogFlops(72.0 * a->nz)); 2420 PetscFunctionReturn(PETSC_SUCCESS); 2421 } 2422 2423 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2424 { 2425 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2426 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2427 const PetscScalar *x, *xb; 2428 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2429 const MatScalar *v; 2430 PetscInt mbs = a->mbs, i, j, n; 2431 const PetscInt *idx, *ii, *ridx = NULL; 2432 PetscBool usecprow = a->compressedrow.use; 2433 2434 PetscFunctionBegin; 2435 PetscCall(VecGetArrayRead(xx, &x)); 2436 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2437 2438 idx = a->j; 2439 v = a->a; 2440 if (usecprow) { 2441 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2442 mbs = a->compressedrow.nrows; 2443 ii = a->compressedrow.i; 2444 ridx = a->compressedrow.rindex; 2445 } else { 2446 ii = a->i; 2447 y = yarray; 2448 z = zarray; 2449 } 2450 2451 for (i = 0; i < mbs; i++) { 2452 n = ii[1] - ii[0]; 2453 ii++; 2454 if (usecprow) { 2455 z = zarray + 7 * ridx[i]; 2456 y = yarray + 7 * ridx[i]; 2457 } 2458 sum1 = y[0]; 2459 sum2 = y[1]; 2460 sum3 = y[2]; 2461 sum4 = y[3]; 2462 sum5 = y[4]; 2463 sum6 = y[5]; 2464 sum7 = y[6]; 2465 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2466 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2467 for (j = 0; j < n; j++) { 2468 xb = x + 7 * (*idx++); 2469 x1 = xb[0]; 2470 x2 = xb[1]; 2471 x3 = xb[2]; 2472 x4 = xb[3]; 2473 x5 = xb[4]; 2474 x6 = xb[5]; 2475 x7 = xb[6]; 2476 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 2477 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 2478 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 2479 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 2480 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 2481 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 2482 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 2483 v += 49; 2484 } 2485 z[0] = sum1; 2486 z[1] = sum2; 2487 z[2] = sum3; 2488 z[3] = sum4; 2489 z[4] = sum5; 2490 z[5] = sum6; 2491 z[6] = sum7; 2492 if (!usecprow) { 2493 z += 7; 2494 y += 7; 2495 } 2496 } 2497 PetscCall(VecRestoreArrayRead(xx, &x)); 2498 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2499 PetscCall(PetscLogFlops(98.0 * a->nz)); 2500 PetscFunctionReturn(PETSC_SUCCESS); 2501 } 2502 2503 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2504 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2505 { 2506 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2507 PetscScalar *z = NULL, *work, *workt, *zarray; 2508 const PetscScalar *x, *xb; 2509 const MatScalar *v; 2510 PetscInt mbs, i, j, n; 2511 PetscInt k; 2512 PetscBool usecprow = a->compressedrow.use; 2513 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 2514 2515 __m256d a0, a1, a2, a3, a4, a5; 2516 __m256d w0, w1, w2, w3; 2517 __m256d z0, z1, z2; 2518 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 2519 2520 PetscFunctionBegin; 2521 PetscCall(VecCopy(yy, zz)); 2522 PetscCall(VecGetArrayRead(xx, &x)); 2523 PetscCall(VecGetArray(zz, &zarray)); 2524 2525 idx = a->j; 2526 v = a->a; 2527 if (usecprow) { 2528 mbs = a->compressedrow.nrows; 2529 ii = a->compressedrow.i; 2530 ridx = a->compressedrow.rindex; 2531 } else { 2532 mbs = a->mbs; 2533 ii = a->i; 2534 z = zarray; 2535 } 2536 2537 if (!a->mult_work) { 2538 k = PetscMax(A->rmap->n, A->cmap->n); 2539 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2540 } 2541 2542 work = a->mult_work; 2543 for (i = 0; i < mbs; i++) { 2544 n = ii[1] - ii[0]; 2545 ii++; 2546 workt = work; 2547 for (j = 0; j < n; j++) { 2548 xb = x + bs * (*idx++); 2549 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2550 workt += bs; 2551 } 2552 if (usecprow) z = zarray + bs * ridx[i]; 2553 2554 z0 = _mm256_loadu_pd(&z[0]); 2555 z1 = _mm256_loadu_pd(&z[4]); 2556 z2 = _mm256_set1_pd(z[8]); 2557 2558 for (j = 0; j < n; j++) { 2559 /* first column of a */ 2560 w0 = _mm256_set1_pd(work[j * 9]); 2561 a0 = _mm256_loadu_pd(&v[j * 81]); 2562 z0 = _mm256_fmadd_pd(a0, w0, z0); 2563 a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 2564 z1 = _mm256_fmadd_pd(a1, w0, z1); 2565 a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 2566 z2 = _mm256_fmadd_pd(a2, w0, z2); 2567 2568 /* second column of a */ 2569 w1 = _mm256_set1_pd(work[j * 9 + 1]); 2570 a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 2571 z0 = _mm256_fmadd_pd(a0, w1, z0); 2572 a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 2573 z1 = _mm256_fmadd_pd(a1, w1, z1); 2574 a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 2575 z2 = _mm256_fmadd_pd(a2, w1, z2); 2576 2577 /* third column of a */ 2578 w2 = _mm256_set1_pd(work[j * 9 + 2]); 2579 a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 2580 z0 = _mm256_fmadd_pd(a3, w2, z0); 2581 a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 2582 z1 = _mm256_fmadd_pd(a4, w2, z1); 2583 a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 2584 z2 = _mm256_fmadd_pd(a5, w2, z2); 2585 2586 /* fourth column of a */ 2587 w3 = _mm256_set1_pd(work[j * 9 + 3]); 2588 a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 2589 z0 = _mm256_fmadd_pd(a0, w3, z0); 2590 a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 2591 z1 = _mm256_fmadd_pd(a1, w3, z1); 2592 a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 2593 z2 = _mm256_fmadd_pd(a2, w3, z2); 2594 2595 /* fifth column of a */ 2596 w0 = _mm256_set1_pd(work[j * 9 + 4]); 2597 a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 2598 z0 = _mm256_fmadd_pd(a3, w0, z0); 2599 a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 2600 z1 = _mm256_fmadd_pd(a4, w0, z1); 2601 a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 2602 z2 = _mm256_fmadd_pd(a5, w0, z2); 2603 2604 /* sixth column of a */ 2605 w1 = _mm256_set1_pd(work[j * 9 + 5]); 2606 a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 2607 z0 = _mm256_fmadd_pd(a0, w1, z0); 2608 a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 2609 z1 = _mm256_fmadd_pd(a1, w1, z1); 2610 a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 2611 z2 = _mm256_fmadd_pd(a2, w1, z2); 2612 2613 /* seventh column of a */ 2614 w2 = _mm256_set1_pd(work[j * 9 + 6]); 2615 a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 2616 z0 = _mm256_fmadd_pd(a0, w2, z0); 2617 a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 2618 z1 = _mm256_fmadd_pd(a1, w2, z1); 2619 a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 2620 z2 = _mm256_fmadd_pd(a2, w2, z2); 2621 2622 /* eighth column of a */ 2623 w3 = _mm256_set1_pd(work[j * 9 + 7]); 2624 a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 2625 z0 = _mm256_fmadd_pd(a3, w3, z0); 2626 a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 2627 z1 = _mm256_fmadd_pd(a4, w3, z1); 2628 a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 2629 z2 = _mm256_fmadd_pd(a5, w3, z2); 2630 2631 /* ninth column of a */ 2632 w0 = _mm256_set1_pd(work[j * 9 + 8]); 2633 a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 2634 z0 = _mm256_fmadd_pd(a0, w0, z0); 2635 a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 2636 z1 = _mm256_fmadd_pd(a1, w0, z1); 2637 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 2638 z2 = _mm256_fmadd_pd(a2, w0, z2); 2639 } 2640 2641 _mm256_storeu_pd(&z[0], z0); 2642 _mm256_storeu_pd(&z[4], z1); 2643 _mm256_maskstore_pd(&z[8], mask1, z2); 2644 2645 v += n * bs2; 2646 if (!usecprow) z += bs; 2647 } 2648 PetscCall(VecRestoreArrayRead(xx, &x)); 2649 PetscCall(VecRestoreArray(zz, &zarray)); 2650 PetscCall(PetscLogFlops(162.0 * a->nz)); 2651 PetscFunctionReturn(PETSC_SUCCESS); 2652 } 2653 #endif 2654 2655 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2656 { 2657 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2658 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2659 const PetscScalar *x, *xb; 2660 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2661 const MatScalar *v; 2662 PetscInt mbs = a->mbs, i, j, n; 2663 const PetscInt *idx, *ii, *ridx = NULL; 2664 PetscBool usecprow = a->compressedrow.use; 2665 2666 PetscFunctionBegin; 2667 PetscCall(VecGetArrayRead(xx, &x)); 2668 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2669 2670 idx = a->j; 2671 v = a->a; 2672 if (usecprow) { 2673 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2674 mbs = a->compressedrow.nrows; 2675 ii = a->compressedrow.i; 2676 ridx = a->compressedrow.rindex; 2677 } else { 2678 ii = a->i; 2679 y = yarray; 2680 z = zarray; 2681 } 2682 2683 for (i = 0; i < mbs; i++) { 2684 n = ii[1] - ii[0]; 2685 ii++; 2686 if (usecprow) { 2687 z = zarray + 11 * ridx[i]; 2688 y = yarray + 11 * ridx[i]; 2689 } 2690 sum1 = y[0]; 2691 sum2 = y[1]; 2692 sum3 = y[2]; 2693 sum4 = y[3]; 2694 sum5 = y[4]; 2695 sum6 = y[5]; 2696 sum7 = y[6]; 2697 sum8 = y[7]; 2698 sum9 = y[8]; 2699 sum10 = y[9]; 2700 sum11 = y[10]; 2701 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2702 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2703 for (j = 0; j < n; j++) { 2704 xb = x + 11 * (*idx++); 2705 x1 = xb[0]; 2706 x2 = xb[1]; 2707 x3 = xb[2]; 2708 x4 = xb[3]; 2709 x5 = xb[4]; 2710 x6 = xb[5]; 2711 x7 = xb[6]; 2712 x8 = xb[7]; 2713 x9 = xb[8]; 2714 x10 = xb[9]; 2715 x11 = xb[10]; 2716 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2717 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2718 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2719 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2720 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2721 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2722 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2723 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2724 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2725 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2726 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2727 v += 121; 2728 } 2729 z[0] = sum1; 2730 z[1] = sum2; 2731 z[2] = sum3; 2732 z[3] = sum4; 2733 z[4] = sum5; 2734 z[5] = sum6; 2735 z[6] = sum7; 2736 z[7] = sum8; 2737 z[8] = sum9; 2738 z[9] = sum10; 2739 z[10] = sum11; 2740 if (!usecprow) { 2741 z += 11; 2742 y += 11; 2743 } 2744 } 2745 PetscCall(VecRestoreArrayRead(xx, &x)); 2746 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 2747 PetscCall(PetscLogFlops(242.0 * a->nz)); 2748 PetscFunctionReturn(PETSC_SUCCESS); 2749 } 2750 2751 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2752 { 2753 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2754 PetscScalar *z = NULL, *work, *workt, *zarray; 2755 const PetscScalar *x, *xb; 2756 const MatScalar *v; 2757 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2758 PetscInt ncols, k; 2759 const PetscInt *ridx = NULL, *idx, *ii; 2760 PetscBool usecprow = a->compressedrow.use; 2761 2762 PetscFunctionBegin; 2763 PetscCall(VecCopy(yy, zz)); 2764 PetscCall(VecGetArrayRead(xx, &x)); 2765 PetscCall(VecGetArray(zz, &zarray)); 2766 2767 idx = a->j; 2768 v = a->a; 2769 if (usecprow) { 2770 mbs = a->compressedrow.nrows; 2771 ii = a->compressedrow.i; 2772 ridx = a->compressedrow.rindex; 2773 } else { 2774 mbs = a->mbs; 2775 ii = a->i; 2776 z = zarray; 2777 } 2778 2779 if (!a->mult_work) { 2780 k = PetscMax(A->rmap->n, A->cmap->n); 2781 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 2782 } 2783 work = a->mult_work; 2784 for (i = 0; i < mbs; i++) { 2785 n = ii[1] - ii[0]; 2786 ii++; 2787 ncols = n * bs; 2788 workt = work; 2789 for (j = 0; j < n; j++) { 2790 xb = x + bs * (*idx++); 2791 for (k = 0; k < bs; k++) workt[k] = xb[k]; 2792 workt += bs; 2793 } 2794 if (usecprow) z = zarray + bs * ridx[i]; 2795 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 2796 v += n * bs2; 2797 if (!usecprow) z += bs; 2798 } 2799 PetscCall(VecRestoreArrayRead(xx, &x)); 2800 PetscCall(VecRestoreArray(zz, &zarray)); 2801 PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2802 PetscFunctionReturn(PETSC_SUCCESS); 2803 } 2804 2805 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2806 { 2807 PetscScalar zero = 0.0; 2808 2809 PetscFunctionBegin; 2810 PetscCall(VecSet(zz, zero)); 2811 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2812 PetscFunctionReturn(PETSC_SUCCESS); 2813 } 2814 2815 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2816 { 2817 PetscScalar zero = 0.0; 2818 2819 PetscFunctionBegin; 2820 PetscCall(VecSet(zz, zero)); 2821 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2822 PetscFunctionReturn(PETSC_SUCCESS); 2823 } 2824 2825 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2826 { 2827 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2828 PetscScalar *z, x1, x2, x3, x4, x5; 2829 const PetscScalar *x, *xb = NULL; 2830 const MatScalar *v; 2831 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2832 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2833 Mat_CompressedRow cprow = a->compressedrow; 2834 PetscBool usecprow = cprow.use; 2835 2836 PetscFunctionBegin; 2837 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2838 PetscCall(VecGetArrayRead(xx, &x)); 2839 PetscCall(VecGetArray(zz, &z)); 2840 2841 idx = a->j; 2842 v = a->a; 2843 if (usecprow) { 2844 mbs = cprow.nrows; 2845 ii = cprow.i; 2846 ridx = cprow.rindex; 2847 } else { 2848 mbs = a->mbs; 2849 ii = a->i; 2850 xb = x; 2851 } 2852 2853 switch (bs) { 2854 case 1: 2855 for (i = 0; i < mbs; i++) { 2856 if (usecprow) xb = x + ridx[i]; 2857 x1 = xb[0]; 2858 ib = idx + ii[0]; 2859 n = ii[1] - ii[0]; 2860 ii++; 2861 for (j = 0; j < n; j++) { 2862 rval = ib[j]; 2863 z[rval] += PetscConj(*v) * x1; 2864 v++; 2865 } 2866 if (!usecprow) xb++; 2867 } 2868 break; 2869 case 2: 2870 for (i = 0; i < mbs; i++) { 2871 if (usecprow) xb = x + 2 * ridx[i]; 2872 x1 = xb[0]; 2873 x2 = xb[1]; 2874 ib = idx + ii[0]; 2875 n = ii[1] - ii[0]; 2876 ii++; 2877 for (j = 0; j < n; j++) { 2878 rval = ib[j] * 2; 2879 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2880 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2881 v += 4; 2882 } 2883 if (!usecprow) xb += 2; 2884 } 2885 break; 2886 case 3: 2887 for (i = 0; i < mbs; i++) { 2888 if (usecprow) xb = x + 3 * ridx[i]; 2889 x1 = xb[0]; 2890 x2 = xb[1]; 2891 x3 = xb[2]; 2892 ib = idx + ii[0]; 2893 n = ii[1] - ii[0]; 2894 ii++; 2895 for (j = 0; j < n; j++) { 2896 rval = ib[j] * 3; 2897 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2898 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2899 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2900 v += 9; 2901 } 2902 if (!usecprow) xb += 3; 2903 } 2904 break; 2905 case 4: 2906 for (i = 0; i < mbs; i++) { 2907 if (usecprow) xb = x + 4 * ridx[i]; 2908 x1 = xb[0]; 2909 x2 = xb[1]; 2910 x3 = xb[2]; 2911 x4 = xb[3]; 2912 ib = idx + ii[0]; 2913 n = ii[1] - ii[0]; 2914 ii++; 2915 for (j = 0; j < n; j++) { 2916 rval = ib[j] * 4; 2917 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2918 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2919 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2920 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2921 v += 16; 2922 } 2923 if (!usecprow) xb += 4; 2924 } 2925 break; 2926 case 5: 2927 for (i = 0; i < mbs; i++) { 2928 if (usecprow) xb = x + 5 * ridx[i]; 2929 x1 = xb[0]; 2930 x2 = xb[1]; 2931 x3 = xb[2]; 2932 x4 = xb[3]; 2933 x5 = xb[4]; 2934 ib = idx + ii[0]; 2935 n = ii[1] - ii[0]; 2936 ii++; 2937 for (j = 0; j < n; j++) { 2938 rval = ib[j] * 5; 2939 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2940 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2941 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2942 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2943 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2944 v += 25; 2945 } 2946 if (!usecprow) xb += 5; 2947 } 2948 break; 2949 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2950 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2951 #if 0 2952 { 2953 PetscInt ncols,k,bs2=a->bs2; 2954 PetscScalar *work,*workt,zb; 2955 const PetscScalar *xtmp; 2956 if (!a->mult_work) { 2957 k = PetscMax(A->rmap->n,A->cmap->n); 2958 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2959 } 2960 work = a->mult_work; 2961 xtmp = x; 2962 for (i=0; i<mbs; i++) { 2963 n = ii[1] - ii[0]; ii++; 2964 ncols = n*bs; 2965 PetscCall(PetscArrayzero(work,ncols)); 2966 if (usecprow) xtmp = x + bs*ridx[i]; 2967 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2968 v += n*bs2; 2969 if (!usecprow) xtmp += bs; 2970 workt = work; 2971 for (j=0; j<n; j++) { 2972 zb = z + bs*(*idx++); 2973 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2974 workt += bs; 2975 } 2976 } 2977 } 2978 #endif 2979 } 2980 PetscCall(VecRestoreArrayRead(xx, &x)); 2981 PetscCall(VecRestoreArray(zz, &z)); 2982 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 2983 PetscFunctionReturn(PETSC_SUCCESS); 2984 } 2985 2986 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2987 { 2988 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2989 PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2990 const PetscScalar *x, *xb = NULL; 2991 const MatScalar *v; 2992 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2993 const PetscInt *idx, *ii, *ib, *ridx = NULL; 2994 Mat_CompressedRow cprow = a->compressedrow; 2995 PetscBool usecprow = cprow.use; 2996 2997 PetscFunctionBegin; 2998 if (yy != zz) PetscCall(VecCopy(yy, zz)); 2999 PetscCall(VecGetArrayRead(xx, &x)); 3000 PetscCall(VecGetArray(zz, &z)); 3001 3002 idx = a->j; 3003 v = a->a; 3004 if (usecprow) { 3005 mbs = cprow.nrows; 3006 ii = cprow.i; 3007 ridx = cprow.rindex; 3008 } else { 3009 mbs = a->mbs; 3010 ii = a->i; 3011 xb = x; 3012 } 3013 3014 switch (bs) { 3015 case 1: 3016 for (i = 0; i < mbs; i++) { 3017 if (usecprow) xb = x + ridx[i]; 3018 x1 = xb[0]; 3019 ib = idx + ii[0]; 3020 n = ii[1] - ii[0]; 3021 ii++; 3022 for (j = 0; j < n; j++) { 3023 rval = ib[j]; 3024 z[rval] += *v * x1; 3025 v++; 3026 } 3027 if (!usecprow) xb++; 3028 } 3029 break; 3030 case 2: 3031 for (i = 0; i < mbs; i++) { 3032 if (usecprow) xb = x + 2 * ridx[i]; 3033 x1 = xb[0]; 3034 x2 = xb[1]; 3035 ib = idx + ii[0]; 3036 n = ii[1] - ii[0]; 3037 ii++; 3038 for (j = 0; j < n; j++) { 3039 rval = ib[j] * 2; 3040 z[rval++] += v[0] * x1 + v[1] * x2; 3041 z[rval++] += v[2] * x1 + v[3] * x2; 3042 v += 4; 3043 } 3044 if (!usecprow) xb += 2; 3045 } 3046 break; 3047 case 3: 3048 for (i = 0; i < mbs; i++) { 3049 if (usecprow) xb = x + 3 * ridx[i]; 3050 x1 = xb[0]; 3051 x2 = xb[1]; 3052 x3 = xb[2]; 3053 ib = idx + ii[0]; 3054 n = ii[1] - ii[0]; 3055 ii++; 3056 for (j = 0; j < n; j++) { 3057 rval = ib[j] * 3; 3058 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 3059 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 3060 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3061 v += 9; 3062 } 3063 if (!usecprow) xb += 3; 3064 } 3065 break; 3066 case 4: 3067 for (i = 0; i < mbs; i++) { 3068 if (usecprow) xb = x + 4 * ridx[i]; 3069 x1 = xb[0]; 3070 x2 = xb[1]; 3071 x3 = xb[2]; 3072 x4 = xb[3]; 3073 ib = idx + ii[0]; 3074 n = ii[1] - ii[0]; 3075 ii++; 3076 for (j = 0; j < n; j++) { 3077 rval = ib[j] * 4; 3078 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 3079 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 3080 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 3081 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3082 v += 16; 3083 } 3084 if (!usecprow) xb += 4; 3085 } 3086 break; 3087 case 5: 3088 for (i = 0; i < mbs; i++) { 3089 if (usecprow) xb = x + 5 * ridx[i]; 3090 x1 = xb[0]; 3091 x2 = xb[1]; 3092 x3 = xb[2]; 3093 x4 = xb[3]; 3094 x5 = xb[4]; 3095 ib = idx + ii[0]; 3096 n = ii[1] - ii[0]; 3097 ii++; 3098 for (j = 0; j < n; j++) { 3099 rval = ib[j] * 5; 3100 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 3101 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 3102 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 3103 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 3104 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 3105 v += 25; 3106 } 3107 if (!usecprow) xb += 5; 3108 } 3109 break; 3110 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3111 PetscInt ncols, k; 3112 PetscScalar *work, *workt; 3113 const PetscScalar *xtmp; 3114 if (!a->mult_work) { 3115 k = PetscMax(A->rmap->n, A->cmap->n); 3116 PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 3117 } 3118 work = a->mult_work; 3119 xtmp = x; 3120 for (i = 0; i < mbs; i++) { 3121 n = ii[1] - ii[0]; 3122 ii++; 3123 ncols = n * bs; 3124 PetscCall(PetscArrayzero(work, ncols)); 3125 if (usecprow) xtmp = x + bs * ridx[i]; 3126 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 3127 v += n * bs2; 3128 if (!usecprow) xtmp += bs; 3129 workt = work; 3130 for (j = 0; j < n; j++) { 3131 zb = z + bs * (*idx++); 3132 for (k = 0; k < bs; k++) zb[k] += workt[k]; 3133 workt += bs; 3134 } 3135 } 3136 } 3137 } 3138 PetscCall(VecRestoreArrayRead(xx, &x)); 3139 PetscCall(VecRestoreArray(zz, &z)); 3140 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3141 PetscFunctionReturn(PETSC_SUCCESS); 3142 } 3143 3144 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3145 { 3146 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3147 PetscInt totalnz = a->bs2 * a->nz; 3148 PetscScalar oalpha = alpha; 3149 PetscBLASInt one = 1, tnz; 3150 3151 PetscFunctionBegin; 3152 PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3153 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 3154 PetscCall(PetscLogFlops(totalnz)); 3155 PetscFunctionReturn(PETSC_SUCCESS); 3156 } 3157 3158 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3159 { 3160 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3161 MatScalar *v = a->a; 3162 PetscReal sum = 0.0; 3163 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 3164 3165 PetscFunctionBegin; 3166 if (type == NORM_FROBENIUS) { 3167 #if defined(PETSC_USE_REAL___FP16) 3168 PetscBLASInt one = 1, cnt = bs2 * nz; 3169 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3170 #else 3171 for (i = 0; i < bs2 * nz; i++) { 3172 sum += PetscRealPart(PetscConj(*v) * (*v)); 3173 v++; 3174 } 3175 #endif 3176 *norm = PetscSqrtReal(sum); 3177 PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 3178 } else if (type == NORM_1) { /* maximum column sum */ 3179 PetscReal *tmp; 3180 PetscInt *bcol = a->j; 3181 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 3182 for (i = 0; i < nz; i++) { 3183 for (j = 0; j < bs; j++) { 3184 k1 = bs * (*bcol) + j; /* column index */ 3185 for (k = 0; k < bs; k++) { 3186 tmp[k1] += PetscAbsScalar(*v); 3187 v++; 3188 } 3189 } 3190 bcol++; 3191 } 3192 *norm = 0.0; 3193 for (j = 0; j < A->cmap->n; j++) { 3194 if (tmp[j] > *norm) *norm = tmp[j]; 3195 } 3196 PetscCall(PetscFree(tmp)); 3197 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3198 } else if (type == NORM_INFINITY) { /* maximum row sum */ 3199 *norm = 0.0; 3200 for (k = 0; k < bs; k++) { 3201 for (j = 0; j < a->mbs; j++) { 3202 v = a->a + bs2 * a->i[j] + k; 3203 sum = 0.0; 3204 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 3205 for (k1 = 0; k1 < bs; k1++) { 3206 sum += PetscAbsScalar(*v); 3207 v += bs; 3208 } 3209 } 3210 if (sum > *norm) *norm = sum; 3211 } 3212 } 3213 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3214 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3215 PetscFunctionReturn(PETSC_SUCCESS); 3216 } 3217 3218 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3219 { 3220 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 3221 3222 PetscFunctionBegin; 3223 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3224 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3225 *flg = PETSC_FALSE; 3226 PetscFunctionReturn(PETSC_SUCCESS); 3227 } 3228 3229 /* if the a->i are the same */ 3230 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3231 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3232 3233 /* if a->j are the same */ 3234 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3235 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 3236 3237 /* if a->a are the same */ 3238 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 3239 PetscFunctionReturn(PETSC_SUCCESS); 3240 } 3241 3242 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3243 { 3244 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3245 PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 3246 PetscScalar *x, zero = 0.0; 3247 MatScalar *aa, *aa_j; 3248 3249 PetscFunctionBegin; 3250 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3251 bs = A->rmap->bs; 3252 aa = a->a; 3253 ai = a->i; 3254 aj = a->j; 3255 ambs = a->mbs; 3256 bs2 = a->bs2; 3257 3258 PetscCall(VecSet(v, zero)); 3259 PetscCall(VecGetArray(v, &x)); 3260 PetscCall(VecGetLocalSize(v, &n)); 3261 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 3262 for (i = 0; i < ambs; i++) { 3263 for (j = ai[i]; j < ai[i + 1]; j++) { 3264 if (aj[j] == i) { 3265 row = i * bs; 3266 aa_j = aa + j * bs2; 3267 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 3268 break; 3269 } 3270 } 3271 } 3272 PetscCall(VecRestoreArray(v, &x)); 3273 PetscFunctionReturn(PETSC_SUCCESS); 3274 } 3275 3276 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3277 { 3278 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3279 const PetscScalar *l, *r, *li, *ri; 3280 PetscScalar x; 3281 MatScalar *aa, *v; 3282 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 3283 const PetscInt *ai, *aj; 3284 3285 PetscFunctionBegin; 3286 ai = a->i; 3287 aj = a->j; 3288 aa = a->a; 3289 m = A->rmap->n; 3290 n = A->cmap->n; 3291 bs = A->rmap->bs; 3292 mbs = a->mbs; 3293 bs2 = a->bs2; 3294 if (ll) { 3295 PetscCall(VecGetArrayRead(ll, &l)); 3296 PetscCall(VecGetLocalSize(ll, &lm)); 3297 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 3298 for (i = 0; i < mbs; i++) { /* for each block row */ 3299 M = ai[i + 1] - ai[i]; 3300 li = l + i * bs; 3301 v = PetscSafePointerPlusOffset(aa, bs2 * ai[i]); 3302 for (j = 0; j < M; j++) { /* for each block */ 3303 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 3304 } 3305 } 3306 PetscCall(VecRestoreArrayRead(ll, &l)); 3307 PetscCall(PetscLogFlops(a->nz)); 3308 } 3309 3310 if (rr) { 3311 PetscCall(VecGetArrayRead(rr, &r)); 3312 PetscCall(VecGetLocalSize(rr, &rn)); 3313 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 3314 for (i = 0; i < mbs; i++) { /* for each block row */ 3315 iai = ai[i]; 3316 M = ai[i + 1] - iai; 3317 v = PetscSafePointerPlusOffset(aa, bs2 * iai); 3318 for (j = 0; j < M; j++) { /* for each block */ 3319 ri = r + bs * aj[iai + j]; 3320 for (k = 0; k < bs; k++) { 3321 x = ri[k]; 3322 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 3323 v += bs; 3324 } 3325 } 3326 } 3327 PetscCall(VecRestoreArrayRead(rr, &r)); 3328 PetscCall(PetscLogFlops(a->nz)); 3329 } 3330 PetscFunctionReturn(PETSC_SUCCESS); 3331 } 3332 3333 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3334 { 3335 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3336 3337 PetscFunctionBegin; 3338 info->block_size = a->bs2; 3339 info->nz_allocated = a->bs2 * a->maxnz; 3340 info->nz_used = a->bs2 * a->nz; 3341 info->nz_unneeded = info->nz_allocated - info->nz_used; 3342 info->assemblies = A->num_ass; 3343 info->mallocs = A->info.mallocs; 3344 info->memory = 0; /* REVIEW ME */ 3345 if (A->factortype) { 3346 info->fill_ratio_given = A->info.fill_ratio_given; 3347 info->fill_ratio_needed = A->info.fill_ratio_needed; 3348 info->factor_mallocs = A->info.factor_mallocs; 3349 } else { 3350 info->fill_ratio_given = 0; 3351 info->fill_ratio_needed = 0; 3352 info->factor_mallocs = 0; 3353 } 3354 PetscFunctionReturn(PETSC_SUCCESS); 3355 } 3356 3357 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3358 { 3359 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3360 3361 PetscFunctionBegin; 3362 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3363 PetscFunctionReturn(PETSC_SUCCESS); 3364 } 3365 3366 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3367 { 3368 PetscFunctionBegin; 3369 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 3370 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3371 PetscFunctionReturn(PETSC_SUCCESS); 3372 } 3373 3374 static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3375 { 3376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3377 PetscScalar *z = NULL, sum1; 3378 const PetscScalar *xb; 3379 PetscScalar x1; 3380 const MatScalar *v, *vv; 3381 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3382 PetscBool usecprow = a->compressedrow.use; 3383 3384 PetscFunctionBegin; 3385 idx = a->j; 3386 v = a->a; 3387 if (usecprow) { 3388 mbs = a->compressedrow.nrows; 3389 ii = a->compressedrow.i; 3390 ridx = a->compressedrow.rindex; 3391 } else { 3392 mbs = a->mbs; 3393 ii = a->i; 3394 z = c; 3395 } 3396 3397 for (i = 0; i < mbs; i++) { 3398 n = ii[1] - ii[0]; 3399 ii++; 3400 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3401 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3402 if (usecprow) z = c + ridx[i]; 3403 jj = idx; 3404 vv = v; 3405 for (k = 0; k < cn; k++) { 3406 idx = jj; 3407 v = vv; 3408 sum1 = 0.0; 3409 for (j = 0; j < n; j++) { 3410 xb = b + (*idx++); 3411 x1 = xb[0 + k * bm]; 3412 sum1 += v[0] * x1; 3413 v += 1; 3414 } 3415 z[0 + k * cm] = sum1; 3416 } 3417 if (!usecprow) z += 1; 3418 } 3419 PetscFunctionReturn(PETSC_SUCCESS); 3420 } 3421 3422 static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3423 { 3424 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3425 PetscScalar *z = NULL, sum1, sum2; 3426 const PetscScalar *xb; 3427 PetscScalar x1, x2; 3428 const MatScalar *v, *vv; 3429 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3430 PetscBool usecprow = a->compressedrow.use; 3431 3432 PetscFunctionBegin; 3433 idx = a->j; 3434 v = a->a; 3435 if (usecprow) { 3436 mbs = a->compressedrow.nrows; 3437 ii = a->compressedrow.i; 3438 ridx = a->compressedrow.rindex; 3439 } else { 3440 mbs = a->mbs; 3441 ii = a->i; 3442 z = c; 3443 } 3444 3445 for (i = 0; i < mbs; i++) { 3446 n = ii[1] - ii[0]; 3447 ii++; 3448 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3449 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3450 if (usecprow) z = c + 2 * ridx[i]; 3451 jj = idx; 3452 vv = v; 3453 for (k = 0; k < cn; k++) { 3454 idx = jj; 3455 v = vv; 3456 sum1 = 0.0; 3457 sum2 = 0.0; 3458 for (j = 0; j < n; j++) { 3459 xb = b + 2 * (*idx++); 3460 x1 = xb[0 + k * bm]; 3461 x2 = xb[1 + k * bm]; 3462 sum1 += v[0] * x1 + v[2] * x2; 3463 sum2 += v[1] * x1 + v[3] * x2; 3464 v += 4; 3465 } 3466 z[0 + k * cm] = sum1; 3467 z[1 + k * cm] = sum2; 3468 } 3469 if (!usecprow) z += 2; 3470 } 3471 PetscFunctionReturn(PETSC_SUCCESS); 3472 } 3473 3474 static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3475 { 3476 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3477 PetscScalar *z = NULL, sum1, sum2, sum3; 3478 const PetscScalar *xb; 3479 PetscScalar x1, x2, x3; 3480 const MatScalar *v, *vv; 3481 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3482 PetscBool usecprow = a->compressedrow.use; 3483 3484 PetscFunctionBegin; 3485 idx = a->j; 3486 v = a->a; 3487 if (usecprow) { 3488 mbs = a->compressedrow.nrows; 3489 ii = a->compressedrow.i; 3490 ridx = a->compressedrow.rindex; 3491 } else { 3492 mbs = a->mbs; 3493 ii = a->i; 3494 z = c; 3495 } 3496 3497 for (i = 0; i < mbs; i++) { 3498 n = ii[1] - ii[0]; 3499 ii++; 3500 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3501 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3502 if (usecprow) z = c + 3 * ridx[i]; 3503 jj = idx; 3504 vv = v; 3505 for (k = 0; k < cn; k++) { 3506 idx = jj; 3507 v = vv; 3508 sum1 = 0.0; 3509 sum2 = 0.0; 3510 sum3 = 0.0; 3511 for (j = 0; j < n; j++) { 3512 xb = b + 3 * (*idx++); 3513 x1 = xb[0 + k * bm]; 3514 x2 = xb[1 + k * bm]; 3515 x3 = xb[2 + k * bm]; 3516 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3517 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3518 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3519 v += 9; 3520 } 3521 z[0 + k * cm] = sum1; 3522 z[1 + k * cm] = sum2; 3523 z[2 + k * cm] = sum3; 3524 } 3525 if (!usecprow) z += 3; 3526 } 3527 PetscFunctionReturn(PETSC_SUCCESS); 3528 } 3529 3530 static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3531 { 3532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3533 PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3534 const PetscScalar *xb; 3535 PetscScalar x1, x2, x3, x4; 3536 const MatScalar *v, *vv; 3537 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3538 PetscBool usecprow = a->compressedrow.use; 3539 3540 PetscFunctionBegin; 3541 idx = a->j; 3542 v = a->a; 3543 if (usecprow) { 3544 mbs = a->compressedrow.nrows; 3545 ii = a->compressedrow.i; 3546 ridx = a->compressedrow.rindex; 3547 } else { 3548 mbs = a->mbs; 3549 ii = a->i; 3550 z = c; 3551 } 3552 3553 for (i = 0; i < mbs; i++) { 3554 n = ii[1] - ii[0]; 3555 ii++; 3556 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3557 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3558 if (usecprow) z = c + 4 * ridx[i]; 3559 jj = idx; 3560 vv = v; 3561 for (k = 0; k < cn; k++) { 3562 idx = jj; 3563 v = vv; 3564 sum1 = 0.0; 3565 sum2 = 0.0; 3566 sum3 = 0.0; 3567 sum4 = 0.0; 3568 for (j = 0; j < n; j++) { 3569 xb = b + 4 * (*idx++); 3570 x1 = xb[0 + k * bm]; 3571 x2 = xb[1 + k * bm]; 3572 x3 = xb[2 + k * bm]; 3573 x4 = xb[3 + k * bm]; 3574 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 3575 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 3576 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 3577 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 3578 v += 16; 3579 } 3580 z[0 + k * cm] = sum1; 3581 z[1 + k * cm] = sum2; 3582 z[2 + k * cm] = sum3; 3583 z[3 + k * cm] = sum4; 3584 } 3585 if (!usecprow) z += 4; 3586 } 3587 PetscFunctionReturn(PETSC_SUCCESS); 3588 } 3589 3590 static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3591 { 3592 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3593 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3594 const PetscScalar *xb; 3595 PetscScalar x1, x2, x3, x4, x5; 3596 const MatScalar *v, *vv; 3597 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 3598 PetscBool usecprow = a->compressedrow.use; 3599 3600 PetscFunctionBegin; 3601 idx = a->j; 3602 v = a->a; 3603 if (usecprow) { 3604 mbs = a->compressedrow.nrows; 3605 ii = a->compressedrow.i; 3606 ridx = a->compressedrow.rindex; 3607 } else { 3608 mbs = a->mbs; 3609 ii = a->i; 3610 z = c; 3611 } 3612 3613 for (i = 0; i < mbs; i++) { 3614 n = ii[1] - ii[0]; 3615 ii++; 3616 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3617 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3618 if (usecprow) z = c + 5 * ridx[i]; 3619 jj = idx; 3620 vv = v; 3621 for (k = 0; k < cn; k++) { 3622 idx = jj; 3623 v = vv; 3624 sum1 = 0.0; 3625 sum2 = 0.0; 3626 sum3 = 0.0; 3627 sum4 = 0.0; 3628 sum5 = 0.0; 3629 for (j = 0; j < n; j++) { 3630 xb = b + 5 * (*idx++); 3631 x1 = xb[0 + k * bm]; 3632 x2 = xb[1 + k * bm]; 3633 x3 = xb[2 + k * bm]; 3634 x4 = xb[3 + k * bm]; 3635 x5 = xb[4 + k * bm]; 3636 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 3637 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 3638 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 3639 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 3640 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 3641 v += 25; 3642 } 3643 z[0 + k * cm] = sum1; 3644 z[1 + k * cm] = sum2; 3645 z[2 + k * cm] = sum3; 3646 z[3 + k * cm] = sum4; 3647 z[4 + k * cm] = sum5; 3648 } 3649 if (!usecprow) z += 5; 3650 } 3651 PetscFunctionReturn(PETSC_SUCCESS); 3652 } 3653 3654 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3655 { 3656 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3657 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3658 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3659 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3660 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3661 PetscBLASInt bbs, bcn, bbm, bcm; 3662 PetscScalar *z = NULL; 3663 PetscScalar *c, *b; 3664 const MatScalar *v; 3665 const PetscInt *idx, *ii, *ridx = NULL; 3666 PetscScalar _DZero = 0.0, _DOne = 1.0; 3667 PetscBool usecprow = a->compressedrow.use; 3668 3669 PetscFunctionBegin; 3670 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 3671 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 3672 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 3673 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3674 b = bd->v; 3675 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 3676 PetscCall(MatDenseGetArray(C, &c)); 3677 switch (bs) { 3678 case 1: 3679 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3680 break; 3681 case 2: 3682 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3683 break; 3684 case 3: 3685 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3686 break; 3687 case 4: 3688 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3689 break; 3690 case 5: 3691 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3692 break; 3693 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3694 PetscCall(PetscBLASIntCast(bs, &bbs)); 3695 PetscCall(PetscBLASIntCast(cn, &bcn)); 3696 PetscCall(PetscBLASIntCast(bm, &bbm)); 3697 PetscCall(PetscBLASIntCast(cm, &bcm)); 3698 idx = a->j; 3699 v = a->a; 3700 if (usecprow) { 3701 mbs = a->compressedrow.nrows; 3702 ii = a->compressedrow.i; 3703 ridx = a->compressedrow.rindex; 3704 } else { 3705 mbs = a->mbs; 3706 ii = a->i; 3707 z = c; 3708 } 3709 for (i = 0; i < mbs; i++) { 3710 n = ii[1] - ii[0]; 3711 ii++; 3712 if (usecprow) z = c + bs * ridx[i]; 3713 if (n) { 3714 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 3715 v += bs2; 3716 } 3717 for (j = 1; j < n; j++) { 3718 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3719 v += bs2; 3720 } 3721 if (!usecprow) z += bs; 3722 } 3723 } 3724 PetscCall(MatDenseRestoreArray(C, &c)); 3725 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3726 PetscFunctionReturn(PETSC_SUCCESS); 3727 } 3728