1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 13 const PetscInt *idx; 14 PetscBT table_out, table_in; 15 16 PetscFunctionBegin; 17 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 PetscCall(PetscBTCreate(mbs, &table_out)); 23 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 24 PetscCall(PetscBTCreate(mbs, &table_in)); 25 26 for (i = 0; i < is_max; i++) { /* for each is */ 27 isz = 0; 28 PetscCall(PetscBTMemzero(mbs, table_out)); 29 30 /* Extract the indices, assume there can be duplicate entries */ 31 PetscCall(ISGetIndices(is[i], &idx)); 32 PetscCall(ISGetLocalSize(is[i], &n)); 33 34 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35 bcol_max = 0; 36 for (j = 0; j < n; ++j) { 37 brow = idx[j] / bs; /* convert the indices into block indices */ 38 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39 if (!PetscBTLookupSet(table_out, brow)) { 40 nidx[isz++] = brow; 41 if (bcol_max < brow) bcol_max = brow; 42 } 43 } 44 PetscCall(ISRestoreIndices(is[i], &idx)); 45 PetscCall(ISDestroy(&is[i])); 46 47 k = 0; 48 for (j = 0; j < ov; j++) { /* for each overlap */ 49 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 50 PetscCall(PetscBTMemzero(mbs, table_in)); 51 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52 53 n = isz; /* length of the updated is[i] */ 54 for (brow = 0; brow < mbs; brow++) { 55 start = ai[brow]; 56 end = ai[brow + 1]; 57 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l < end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out, bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l < end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in, bcol)) { 72 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 80 } /* for each is */ 81 PetscCall(PetscBTDestroy(&table_out)); 82 PetscCall(PetscFree(nidx)); 83 PetscCall(PetscBTDestroy(&table_in)); 84 PetscFunctionReturn(0); 85 } 86 87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 88 Zero some ops' to avoid invalid usse */ 89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90 { 91 PetscFunctionBegin; 92 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93 Bseq->ops->mult = NULL; 94 Bseq->ops->multadd = NULL; 95 Bseq->ops->multtranspose = NULL; 96 Bseq->ops->multtransposeadd = NULL; 97 Bseq->ops->lufactor = NULL; 98 Bseq->ops->choleskyfactor = NULL; 99 Bseq->ops->lufactorsymbolic = NULL; 100 Bseq->ops->choleskyfactorsymbolic = NULL; 101 Bseq->ops->getinertia = NULL; 102 PetscFunctionReturn(0); 103 } 104 105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 107 { 108 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111 const PetscInt *irow, *icol; 112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113 PetscInt *aj = a->j, *ai = a->i; 114 MatScalar *mat_a; 115 Mat C; 116 PetscBool flag; 117 118 PetscFunctionBegin; 119 120 PetscCall(ISGetIndices(isrow, &irow)); 121 PetscCall(ISGetIndices(iscol, &icol)); 122 PetscCall(ISGetLocalSize(isrow, &nrows)); 123 PetscCall(ISGetLocalSize(iscol, &ncols)); 124 125 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126 ssmap = smap; 127 PetscCall(PetscMalloc1(1 + nrows, &lens)); 128 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129 /* determine lens of each row */ 130 for (i = 0; i < nrows; i++) { 131 kstart = ai[irow[i]]; 132 kend = kstart + a->ilen[irow[i]]; 133 lens[i] = 0; 134 for (k = kstart; k < kend; k++) { 135 if (ssmap[aj[k]]) lens[i]++; 136 } 137 } 138 /* Create and fill new matrix */ 139 if (scall == MAT_REUSE_MATRIX) { 140 c = (Mat_SeqSBAIJ *)((*B)->data); 141 142 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 143 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 144 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 145 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146 C = *B; 147 } else { 148 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 149 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 150 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 151 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 152 } 153 c = (Mat_SeqSBAIJ *)(C->data); 154 for (i = 0; i < nrows; i++) { 155 row = irow[i]; 156 kstart = ai[row]; 157 kend = kstart + a->ilen[row]; 158 mat_i = c->i[i]; 159 mat_j = c->j + mat_i; 160 mat_a = c->a + mat_i * bs2; 161 mat_ilen = c->ilen + i; 162 for (k = kstart; k < kend; k++) { 163 if ((tcol = ssmap[a->j[k]])) { 164 *mat_j++ = tcol - 1; 165 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 166 mat_a += bs2; 167 (*mat_ilen)++; 168 } 169 } 170 } 171 /* sort */ 172 { 173 MatScalar *work; 174 175 PetscCall(PetscMalloc1(bs2, &work)); 176 for (i = 0; i < nrows; i++) { 177 PetscInt ilen; 178 mat_i = c->i[i]; 179 mat_j = c->j + mat_i; 180 mat_a = c->a + mat_i * bs2; 181 ilen = c->ilen[i]; 182 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 183 } 184 PetscCall(PetscFree(work)); 185 } 186 187 /* Free work space */ 188 PetscCall(ISRestoreIndices(iscol, &icol)); 189 PetscCall(PetscFree(smap)); 190 PetscCall(PetscFree(lens)); 191 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 192 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 193 194 PetscCall(ISRestoreIndices(isrow, &irow)); 195 *B = C; 196 PetscFunctionReturn(0); 197 } 198 199 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 200 { 201 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 202 IS is1, is2; 203 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 204 const PetscInt *irow, *icol; 205 206 PetscFunctionBegin; 207 PetscCall(ISGetIndices(isrow, &irow)); 208 PetscCall(ISGetIndices(iscol, &icol)); 209 PetscCall(ISGetLocalSize(isrow, &nrows)); 210 PetscCall(ISGetLocalSize(iscol, &ncols)); 211 212 /* Verify if the indices corespond to each element in a block 213 and form the IS with compressed IS */ 214 maxmnbs = PetscMax(a->mbs, a->nbs); 215 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 216 PetscCall(PetscArrayzero(vary, a->mbs)); 217 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 218 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 219 count = 0; 220 for (i = 0; i < nrows; i++) { 221 PetscInt j = irow[i] / bs; 222 if ((vary[j]--) == bs) iary[count++] = j; 223 } 224 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 225 226 PetscCall(PetscArrayzero(vary, a->nbs)); 227 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 228 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 229 count = 0; 230 for (i = 0; i < ncols; i++) { 231 PetscInt j = icol[i] / bs; 232 if ((vary[j]--) == bs) iary[count++] = j; 233 } 234 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 235 PetscCall(ISRestoreIndices(isrow, &irow)); 236 PetscCall(ISRestoreIndices(iscol, &icol)); 237 PetscCall(PetscFree2(vary, iary)); 238 239 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 240 PetscCall(ISDestroy(&is1)); 241 PetscCall(ISDestroy(&is2)); 242 243 if (isrow != iscol) { 244 PetscBool isequal; 245 PetscCall(ISEqual(isrow, iscol, &isequal)); 246 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 247 } 248 PetscFunctionReturn(0); 249 } 250 251 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 252 { 253 PetscInt i; 254 255 PetscFunctionBegin; 256 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 257 258 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 259 PetscFunctionReturn(0); 260 } 261 262 /* -------------------------------------------------------*/ 263 /* Should check that shapes of vectors and matrices match */ 264 /* -------------------------------------------------------*/ 265 266 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 267 { 268 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 269 PetscScalar *z, x1, x2, zero = 0.0; 270 const PetscScalar *x, *xb; 271 const MatScalar *v; 272 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 273 const PetscInt *aj = a->j, *ai = a->i, *ib; 274 PetscInt nonzerorow = 0; 275 276 PetscFunctionBegin; 277 PetscCall(VecSet(zz, zero)); 278 if (!a->nz) PetscFunctionReturn(0); 279 PetscCall(VecGetArrayRead(xx, &x)); 280 PetscCall(VecGetArray(zz, &z)); 281 282 v = a->a; 283 xb = x; 284 285 for (i = 0; i < mbs; i++) { 286 n = ai[1] - ai[0]; /* length of i_th block row of A */ 287 x1 = xb[0]; 288 x2 = xb[1]; 289 ib = aj + *ai; 290 jmin = 0; 291 nonzerorow += (n > 0); 292 if (*ib == i) { /* (diag of A)*x */ 293 z[2 * i] += v[0] * x1 + v[2] * x2; 294 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 295 v += 4; 296 jmin++; 297 } 298 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 299 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 300 for (j = jmin; j < n; j++) { 301 /* (strict lower triangular part of A)*x */ 302 cval = ib[j] * 2; 303 z[cval] += v[0] * x1 + v[1] * x2; 304 z[cval + 1] += v[2] * x1 + v[3] * x2; 305 /* (strict upper triangular part of A)*x */ 306 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 307 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 308 v += 4; 309 } 310 xb += 2; 311 ai++; 312 } 313 314 PetscCall(VecRestoreArrayRead(xx, &x)); 315 PetscCall(VecRestoreArray(zz, &z)); 316 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 317 PetscFunctionReturn(0); 318 } 319 320 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 321 { 322 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 323 PetscScalar *z, x1, x2, x3, zero = 0.0; 324 const PetscScalar *x, *xb; 325 const MatScalar *v; 326 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 327 const PetscInt *aj = a->j, *ai = a->i, *ib; 328 PetscInt nonzerorow = 0; 329 330 PetscFunctionBegin; 331 PetscCall(VecSet(zz, zero)); 332 if (!a->nz) PetscFunctionReturn(0); 333 PetscCall(VecGetArrayRead(xx, &x)); 334 PetscCall(VecGetArray(zz, &z)); 335 336 v = a->a; 337 xb = x; 338 339 for (i = 0; i < mbs; i++) { 340 n = ai[1] - ai[0]; /* length of i_th block row of A */ 341 x1 = xb[0]; 342 x2 = xb[1]; 343 x3 = xb[2]; 344 ib = aj + *ai; 345 jmin = 0; 346 nonzerorow += (n > 0); 347 if (*ib == i) { /* (diag of A)*x */ 348 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 349 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 350 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 351 v += 9; 352 jmin++; 353 } 354 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 356 for (j = jmin; j < n; j++) { 357 /* (strict lower triangular part of A)*x */ 358 cval = ib[j] * 3; 359 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 360 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 361 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 362 /* (strict upper triangular part of A)*x */ 363 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 364 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 365 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 366 v += 9; 367 } 368 xb += 3; 369 ai++; 370 } 371 372 PetscCall(VecRestoreArrayRead(xx, &x)); 373 PetscCall(VecRestoreArray(zz, &z)); 374 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 375 PetscFunctionReturn(0); 376 } 377 378 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 379 { 380 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 381 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 382 const PetscScalar *x, *xb; 383 const MatScalar *v; 384 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 385 const PetscInt *aj = a->j, *ai = a->i, *ib; 386 PetscInt nonzerorow = 0; 387 388 PetscFunctionBegin; 389 PetscCall(VecSet(zz, zero)); 390 if (!a->nz) PetscFunctionReturn(0); 391 PetscCall(VecGetArrayRead(xx, &x)); 392 PetscCall(VecGetArray(zz, &z)); 393 394 v = a->a; 395 xb = x; 396 397 for (i = 0; i < mbs; i++) { 398 n = ai[1] - ai[0]; /* length of i_th block row of A */ 399 x1 = xb[0]; 400 x2 = xb[1]; 401 x3 = xb[2]; 402 x4 = xb[3]; 403 ib = aj + *ai; 404 jmin = 0; 405 nonzerorow += (n > 0); 406 if (*ib == i) { /* (diag of A)*x */ 407 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 408 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 409 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 410 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 411 v += 16; 412 jmin++; 413 } 414 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 415 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 416 for (j = jmin; j < n; j++) { 417 /* (strict lower triangular part of A)*x */ 418 cval = ib[j] * 4; 419 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 420 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 421 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 422 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 423 /* (strict upper triangular part of A)*x */ 424 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 425 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 426 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 427 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 428 v += 16; 429 } 430 xb += 4; 431 ai++; 432 } 433 434 PetscCall(VecRestoreArrayRead(xx, &x)); 435 PetscCall(VecRestoreArray(zz, &z)); 436 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 437 PetscFunctionReturn(0); 438 } 439 440 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 441 { 442 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 443 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 444 const PetscScalar *x, *xb; 445 const MatScalar *v; 446 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 447 const PetscInt *aj = a->j, *ai = a->i, *ib; 448 PetscInt nonzerorow = 0; 449 450 PetscFunctionBegin; 451 PetscCall(VecSet(zz, zero)); 452 if (!a->nz) PetscFunctionReturn(0); 453 PetscCall(VecGetArrayRead(xx, &x)); 454 PetscCall(VecGetArray(zz, &z)); 455 456 v = a->a; 457 xb = x; 458 459 for (i = 0; i < mbs; i++) { 460 n = ai[1] - ai[0]; /* length of i_th block row of A */ 461 x1 = xb[0]; 462 x2 = xb[1]; 463 x3 = xb[2]; 464 x4 = xb[3]; 465 x5 = xb[4]; 466 ib = aj + *ai; 467 jmin = 0; 468 nonzerorow += (n > 0); 469 if (*ib == i) { /* (diag of A)*x */ 470 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 471 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 472 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 473 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 474 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 475 v += 25; 476 jmin++; 477 } 478 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 479 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 480 for (j = jmin; j < n; j++) { 481 /* (strict lower triangular part of A)*x */ 482 cval = ib[j] * 5; 483 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 484 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 485 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 486 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 487 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 488 /* (strict upper triangular part of A)*x */ 489 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 490 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 491 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 492 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 493 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 494 v += 25; 495 } 496 xb += 5; 497 ai++; 498 } 499 500 PetscCall(VecRestoreArrayRead(xx, &x)); 501 PetscCall(VecRestoreArray(zz, &z)); 502 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 503 PetscFunctionReturn(0); 504 } 505 506 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 507 { 508 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 509 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 510 const PetscScalar *x, *xb; 511 const MatScalar *v; 512 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 513 const PetscInt *aj = a->j, *ai = a->i, *ib; 514 PetscInt nonzerorow = 0; 515 516 PetscFunctionBegin; 517 PetscCall(VecSet(zz, zero)); 518 if (!a->nz) PetscFunctionReturn(0); 519 PetscCall(VecGetArrayRead(xx, &x)); 520 PetscCall(VecGetArray(zz, &z)); 521 522 v = a->a; 523 xb = x; 524 525 for (i = 0; i < mbs; i++) { 526 n = ai[1] - ai[0]; /* length of i_th block row of A */ 527 x1 = xb[0]; 528 x2 = xb[1]; 529 x3 = xb[2]; 530 x4 = xb[3]; 531 x5 = xb[4]; 532 x6 = xb[5]; 533 ib = aj + *ai; 534 jmin = 0; 535 nonzerorow += (n > 0); 536 if (*ib == i) { /* (diag of A)*x */ 537 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 538 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 539 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 540 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 541 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 542 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 543 v += 36; 544 jmin++; 545 } 546 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 547 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 548 for (j = jmin; j < n; j++) { 549 /* (strict lower triangular part of A)*x */ 550 cval = ib[j] * 6; 551 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 552 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 553 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 554 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 555 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 556 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 557 /* (strict upper triangular part of A)*x */ 558 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 559 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 560 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 561 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 562 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 563 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 564 v += 36; 565 } 566 xb += 6; 567 ai++; 568 } 569 570 PetscCall(VecRestoreArrayRead(xx, &x)); 571 PetscCall(VecRestoreArray(zz, &z)); 572 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 573 PetscFunctionReturn(0); 574 } 575 576 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 577 { 578 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 579 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 580 const PetscScalar *x, *xb; 581 const MatScalar *v; 582 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 583 const PetscInt *aj = a->j, *ai = a->i, *ib; 584 PetscInt nonzerorow = 0; 585 586 PetscFunctionBegin; 587 PetscCall(VecSet(zz, zero)); 588 if (!a->nz) PetscFunctionReturn(0); 589 PetscCall(VecGetArrayRead(xx, &x)); 590 PetscCall(VecGetArray(zz, &z)); 591 592 v = a->a; 593 xb = x; 594 595 for (i = 0; i < mbs; i++) { 596 n = ai[1] - ai[0]; /* length of i_th block row of A */ 597 x1 = xb[0]; 598 x2 = xb[1]; 599 x3 = xb[2]; 600 x4 = xb[3]; 601 x5 = xb[4]; 602 x6 = xb[5]; 603 x7 = xb[6]; 604 ib = aj + *ai; 605 jmin = 0; 606 nonzerorow += (n > 0); 607 if (*ib == i) { /* (diag of A)*x */ 608 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 609 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 610 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 611 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 612 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 613 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 614 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 615 v += 49; 616 jmin++; 617 } 618 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 619 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 620 for (j = jmin; j < n; j++) { 621 /* (strict lower triangular part of A)*x */ 622 cval = ib[j] * 7; 623 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 624 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 625 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 626 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 627 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 628 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 629 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 630 /* (strict upper triangular part of A)*x */ 631 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 632 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 633 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 634 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 635 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 636 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 637 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 638 v += 49; 639 } 640 xb += 7; 641 ai++; 642 } 643 PetscCall(VecRestoreArrayRead(xx, &x)); 644 PetscCall(VecRestoreArray(zz, &z)); 645 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 646 PetscFunctionReturn(0); 647 } 648 649 /* 650 This will not work with MatScalar == float because it calls the BLAS 651 */ 652 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 653 { 654 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 655 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 656 const PetscScalar *x, *x_ptr, *xb; 657 const MatScalar *v; 658 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 659 const PetscInt *idx, *aj, *ii; 660 PetscInt nonzerorow = 0; 661 662 PetscFunctionBegin; 663 PetscCall(VecSet(zz, zero)); 664 if (!a->nz) PetscFunctionReturn(0); 665 PetscCall(VecGetArrayRead(xx, &x)); 666 PetscCall(VecGetArray(zz, &z)); 667 668 x_ptr = x; 669 z_ptr = z; 670 671 aj = a->j; 672 v = a->a; 673 ii = a->i; 674 675 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 676 work = a->mult_work; 677 678 for (i = 0; i < mbs; i++) { 679 n = ii[1] - ii[0]; 680 ncols = n * bs; 681 workt = work; 682 idx = aj + ii[0]; 683 nonzerorow += (n > 0); 684 685 /* upper triangular part */ 686 for (j = 0; j < n; j++) { 687 xb = x_ptr + bs * (*idx++); 688 for (k = 0; k < bs; k++) workt[k] = xb[k]; 689 workt += bs; 690 } 691 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 692 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 693 694 /* strict lower triangular part */ 695 idx = aj + ii[0]; 696 if (n && *idx == i) { 697 ncols -= bs; 698 v += bs2; 699 idx++; 700 n--; 701 } 702 703 if (ncols > 0) { 704 workt = work; 705 PetscCall(PetscArrayzero(workt, ncols)); 706 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 707 for (j = 0; j < n; j++) { 708 zb = z_ptr + bs * (*idx++); 709 for (k = 0; k < bs; k++) zb[k] += workt[k]; 710 workt += bs; 711 } 712 } 713 x += bs; 714 v += n * bs2; 715 z += bs; 716 ii++; 717 } 718 719 PetscCall(VecRestoreArrayRead(xx, &x)); 720 PetscCall(VecRestoreArray(zz, &z)); 721 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 722 PetscFunctionReturn(0); 723 } 724 725 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 726 { 727 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 728 PetscScalar *z, x1; 729 const PetscScalar *x, *xb; 730 const MatScalar *v; 731 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 732 const PetscInt *aj = a->j, *ai = a->i, *ib; 733 PetscInt nonzerorow = 0; 734 #if defined(PETSC_USE_COMPLEX) 735 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 736 #else 737 const int aconj = 0; 738 #endif 739 740 PetscFunctionBegin; 741 PetscCall(VecCopy(yy, zz)); 742 if (!a->nz) PetscFunctionReturn(0); 743 PetscCall(VecGetArrayRead(xx, &x)); 744 PetscCall(VecGetArray(zz, &z)); 745 v = a->a; 746 xb = x; 747 748 for (i = 0; i < mbs; i++) { 749 n = ai[1] - ai[0]; /* length of i_th row of A */ 750 x1 = xb[0]; 751 ib = aj + *ai; 752 jmin = 0; 753 nonzerorow += (n > 0); 754 if (n && *ib == i) { /* (diag of A)*x */ 755 z[i] += *v++ * x[*ib++]; 756 jmin++; 757 } 758 if (aconj) { 759 for (j = jmin; j < n; j++) { 760 cval = *ib; 761 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 762 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 763 } 764 } else { 765 for (j = jmin; j < n; j++) { 766 cval = *ib; 767 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 768 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 769 } 770 } 771 xb++; 772 ai++; 773 } 774 775 PetscCall(VecRestoreArrayRead(xx, &x)); 776 PetscCall(VecRestoreArray(zz, &z)); 777 778 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 779 PetscFunctionReturn(0); 780 } 781 782 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 783 { 784 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 785 PetscScalar *z, x1, x2; 786 const PetscScalar *x, *xb; 787 const MatScalar *v; 788 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 789 const PetscInt *aj = a->j, *ai = a->i, *ib; 790 PetscInt nonzerorow = 0; 791 792 PetscFunctionBegin; 793 PetscCall(VecCopy(yy, zz)); 794 if (!a->nz) PetscFunctionReturn(0); 795 PetscCall(VecGetArrayRead(xx, &x)); 796 PetscCall(VecGetArray(zz, &z)); 797 798 v = a->a; 799 xb = x; 800 801 for (i = 0; i < mbs; i++) { 802 n = ai[1] - ai[0]; /* length of i_th block row of A */ 803 x1 = xb[0]; 804 x2 = xb[1]; 805 ib = aj + *ai; 806 jmin = 0; 807 nonzerorow += (n > 0); 808 if (n && *ib == i) { /* (diag of A)*x */ 809 z[2 * i] += v[0] * x1 + v[2] * x2; 810 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 811 v += 4; 812 jmin++; 813 } 814 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 815 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 816 for (j = jmin; j < n; j++) { 817 /* (strict lower triangular part of A)*x */ 818 cval = ib[j] * 2; 819 z[cval] += v[0] * x1 + v[1] * x2; 820 z[cval + 1] += v[2] * x1 + v[3] * x2; 821 /* (strict upper triangular part of A)*x */ 822 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 823 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 824 v += 4; 825 } 826 xb += 2; 827 ai++; 828 } 829 PetscCall(VecRestoreArrayRead(xx, &x)); 830 PetscCall(VecRestoreArray(zz, &z)); 831 832 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 833 PetscFunctionReturn(0); 834 } 835 836 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 837 { 838 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 839 PetscScalar *z, x1, x2, x3; 840 const PetscScalar *x, *xb; 841 const MatScalar *v; 842 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 843 const PetscInt *aj = a->j, *ai = a->i, *ib; 844 PetscInt nonzerorow = 0; 845 846 PetscFunctionBegin; 847 PetscCall(VecCopy(yy, zz)); 848 if (!a->nz) PetscFunctionReturn(0); 849 PetscCall(VecGetArrayRead(xx, &x)); 850 PetscCall(VecGetArray(zz, &z)); 851 852 v = a->a; 853 xb = x; 854 855 for (i = 0; i < mbs; i++) { 856 n = ai[1] - ai[0]; /* length of i_th block row of A */ 857 x1 = xb[0]; 858 x2 = xb[1]; 859 x3 = xb[2]; 860 ib = aj + *ai; 861 jmin = 0; 862 nonzerorow += (n > 0); 863 if (n && *ib == i) { /* (diag of A)*x */ 864 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 865 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 866 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 867 v += 9; 868 jmin++; 869 } 870 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 871 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 872 for (j = jmin; j < n; j++) { 873 /* (strict lower triangular part of A)*x */ 874 cval = ib[j] * 3; 875 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 876 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 877 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 878 /* (strict upper triangular part of A)*x */ 879 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 880 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 881 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 882 v += 9; 883 } 884 xb += 3; 885 ai++; 886 } 887 888 PetscCall(VecRestoreArrayRead(xx, &x)); 889 PetscCall(VecRestoreArray(zz, &z)); 890 891 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 896 { 897 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 898 PetscScalar *z, x1, x2, x3, x4; 899 const PetscScalar *x, *xb; 900 const MatScalar *v; 901 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 902 const PetscInt *aj = a->j, *ai = a->i, *ib; 903 PetscInt nonzerorow = 0; 904 905 PetscFunctionBegin; 906 PetscCall(VecCopy(yy, zz)); 907 if (!a->nz) PetscFunctionReturn(0); 908 PetscCall(VecGetArrayRead(xx, &x)); 909 PetscCall(VecGetArray(zz, &z)); 910 911 v = a->a; 912 xb = x; 913 914 for (i = 0; i < mbs; i++) { 915 n = ai[1] - ai[0]; /* length of i_th block row of A */ 916 x1 = xb[0]; 917 x2 = xb[1]; 918 x3 = xb[2]; 919 x4 = xb[3]; 920 ib = aj + *ai; 921 jmin = 0; 922 nonzerorow += (n > 0); 923 if (n && *ib == i) { /* (diag of A)*x */ 924 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 925 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 926 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 927 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 928 v += 16; 929 jmin++; 930 } 931 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 932 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 933 for (j = jmin; j < n; j++) { 934 /* (strict lower triangular part of A)*x */ 935 cval = ib[j] * 4; 936 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 937 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 938 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 939 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 940 /* (strict upper triangular part of A)*x */ 941 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 942 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 943 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 944 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 945 v += 16; 946 } 947 xb += 4; 948 ai++; 949 } 950 951 PetscCall(VecRestoreArrayRead(xx, &x)); 952 PetscCall(VecRestoreArray(zz, &z)); 953 954 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 955 PetscFunctionReturn(0); 956 } 957 958 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 959 { 960 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 961 PetscScalar *z, x1, x2, x3, x4, x5; 962 const PetscScalar *x, *xb; 963 const MatScalar *v; 964 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 965 const PetscInt *aj = a->j, *ai = a->i, *ib; 966 PetscInt nonzerorow = 0; 967 968 PetscFunctionBegin; 969 PetscCall(VecCopy(yy, zz)); 970 if (!a->nz) PetscFunctionReturn(0); 971 PetscCall(VecGetArrayRead(xx, &x)); 972 PetscCall(VecGetArray(zz, &z)); 973 974 v = a->a; 975 xb = x; 976 977 for (i = 0; i < mbs; i++) { 978 n = ai[1] - ai[0]; /* length of i_th block row of A */ 979 x1 = xb[0]; 980 x2 = xb[1]; 981 x3 = xb[2]; 982 x4 = xb[3]; 983 x5 = xb[4]; 984 ib = aj + *ai; 985 jmin = 0; 986 nonzerorow += (n > 0); 987 if (n && *ib == i) { /* (diag of A)*x */ 988 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 989 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 990 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 991 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 992 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 993 v += 25; 994 jmin++; 995 } 996 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 997 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 998 for (j = jmin; j < n; j++) { 999 /* (strict lower triangular part of A)*x */ 1000 cval = ib[j] * 5; 1001 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 1002 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1003 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1004 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1005 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1006 /* (strict upper triangular part of A)*x */ 1007 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1008 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1009 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1010 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1011 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1012 v += 25; 1013 } 1014 xb += 5; 1015 ai++; 1016 } 1017 1018 PetscCall(VecRestoreArrayRead(xx, &x)); 1019 PetscCall(VecRestoreArray(zz, &z)); 1020 1021 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1022 PetscFunctionReturn(0); 1023 } 1024 1025 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1026 { 1027 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1028 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1029 const PetscScalar *x, *xb; 1030 const MatScalar *v; 1031 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1032 const PetscInt *aj = a->j, *ai = a->i, *ib; 1033 PetscInt nonzerorow = 0; 1034 1035 PetscFunctionBegin; 1036 PetscCall(VecCopy(yy, zz)); 1037 if (!a->nz) PetscFunctionReturn(0); 1038 PetscCall(VecGetArrayRead(xx, &x)); 1039 PetscCall(VecGetArray(zz, &z)); 1040 1041 v = a->a; 1042 xb = x; 1043 1044 for (i = 0; i < mbs; i++) { 1045 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1046 x1 = xb[0]; 1047 x2 = xb[1]; 1048 x3 = xb[2]; 1049 x4 = xb[3]; 1050 x5 = xb[4]; 1051 x6 = xb[5]; 1052 ib = aj + *ai; 1053 jmin = 0; 1054 nonzerorow += (n > 0); 1055 if (n && *ib == i) { /* (diag of A)*x */ 1056 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1057 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1058 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1059 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1060 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1061 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1062 v += 36; 1063 jmin++; 1064 } 1065 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1066 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1067 for (j = jmin; j < n; j++) { 1068 /* (strict lower triangular part of A)*x */ 1069 cval = ib[j] * 6; 1070 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1071 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1072 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1073 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1074 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1075 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1076 /* (strict upper triangular part of A)*x */ 1077 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1078 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1079 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1080 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1081 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1082 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1083 v += 36; 1084 } 1085 xb += 6; 1086 ai++; 1087 } 1088 1089 PetscCall(VecRestoreArrayRead(xx, &x)); 1090 PetscCall(VecRestoreArray(zz, &z)); 1091 1092 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1093 PetscFunctionReturn(0); 1094 } 1095 1096 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1097 { 1098 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1099 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1100 const PetscScalar *x, *xb; 1101 const MatScalar *v; 1102 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1103 const PetscInt *aj = a->j, *ai = a->i, *ib; 1104 PetscInt nonzerorow = 0; 1105 1106 PetscFunctionBegin; 1107 PetscCall(VecCopy(yy, zz)); 1108 if (!a->nz) PetscFunctionReturn(0); 1109 PetscCall(VecGetArrayRead(xx, &x)); 1110 PetscCall(VecGetArray(zz, &z)); 1111 1112 v = a->a; 1113 xb = x; 1114 1115 for (i = 0; i < mbs; i++) { 1116 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1117 x1 = xb[0]; 1118 x2 = xb[1]; 1119 x3 = xb[2]; 1120 x4 = xb[3]; 1121 x5 = xb[4]; 1122 x6 = xb[5]; 1123 x7 = xb[6]; 1124 ib = aj + *ai; 1125 jmin = 0; 1126 nonzerorow += (n > 0); 1127 if (n && *ib == i) { /* (diag of A)*x */ 1128 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1129 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1130 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1131 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1132 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1133 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1134 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1135 v += 49; 1136 jmin++; 1137 } 1138 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1139 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1140 for (j = jmin; j < n; j++) { 1141 /* (strict lower triangular part of A)*x */ 1142 cval = ib[j] * 7; 1143 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1144 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1145 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1146 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1147 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1148 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1149 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1150 /* (strict upper triangular part of A)*x */ 1151 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1152 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1153 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1154 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1155 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1156 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1157 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1158 v += 49; 1159 } 1160 xb += 7; 1161 ai++; 1162 } 1163 1164 PetscCall(VecRestoreArrayRead(xx, &x)); 1165 PetscCall(VecRestoreArray(zz, &z)); 1166 1167 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1168 PetscFunctionReturn(0); 1169 } 1170 1171 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1172 { 1173 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1174 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1175 const PetscScalar *x, *x_ptr, *xb; 1176 const MatScalar *v; 1177 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1178 const PetscInt *idx, *aj, *ii; 1179 PetscInt nonzerorow = 0; 1180 1181 PetscFunctionBegin; 1182 PetscCall(VecCopy(yy, zz)); 1183 if (!a->nz) PetscFunctionReturn(0); 1184 PetscCall(VecGetArrayRead(xx, &x)); 1185 x_ptr = x; 1186 PetscCall(VecGetArray(zz, &z)); 1187 z_ptr = z; 1188 1189 aj = a->j; 1190 v = a->a; 1191 ii = a->i; 1192 1193 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1194 work = a->mult_work; 1195 1196 for (i = 0; i < mbs; i++) { 1197 n = ii[1] - ii[0]; 1198 ncols = n * bs; 1199 workt = work; 1200 idx = aj + ii[0]; 1201 nonzerorow += (n > 0); 1202 1203 /* upper triangular part */ 1204 for (j = 0; j < n; j++) { 1205 xb = x_ptr + bs * (*idx++); 1206 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1207 workt += bs; 1208 } 1209 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1210 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1211 1212 /* strict lower triangular part */ 1213 idx = aj + ii[0]; 1214 if (n && *idx == i) { 1215 ncols -= bs; 1216 v += bs2; 1217 idx++; 1218 n--; 1219 } 1220 if (ncols > 0) { 1221 workt = work; 1222 PetscCall(PetscArrayzero(workt, ncols)); 1223 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1224 for (j = 0; j < n; j++) { 1225 zb = z_ptr + bs * (*idx++); 1226 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1227 workt += bs; 1228 } 1229 } 1230 1231 x += bs; 1232 v += n * bs2; 1233 z += bs; 1234 ii++; 1235 } 1236 1237 PetscCall(VecRestoreArrayRead(xx, &x)); 1238 PetscCall(VecRestoreArray(zz, &z)); 1239 1240 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1241 PetscFunctionReturn(0); 1242 } 1243 1244 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1245 { 1246 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1247 PetscScalar oalpha = alpha; 1248 PetscBLASInt one = 1, totalnz; 1249 1250 PetscFunctionBegin; 1251 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1252 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1253 PetscCall(PetscLogFlops(totalnz)); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1258 { 1259 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1260 const MatScalar *v = a->a; 1261 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1262 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1263 const PetscInt *aj = a->j, *col; 1264 1265 PetscFunctionBegin; 1266 if (!a->nz) { 1267 *norm = 0.0; 1268 PetscFunctionReturn(0); 1269 } 1270 if (type == NORM_FROBENIUS) { 1271 for (k = 0; k < mbs; k++) { 1272 jmin = a->i[k]; 1273 jmax = a->i[k + 1]; 1274 col = aj + jmin; 1275 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1276 for (i = 0; i < bs2; i++) { 1277 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1278 v++; 1279 } 1280 jmin++; 1281 } 1282 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1283 for (i = 0; i < bs2; i++) { 1284 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1285 v++; 1286 } 1287 } 1288 } 1289 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1290 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1291 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1292 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1293 for (i = 0; i < mbs; i++) jl[i] = mbs; 1294 il[0] = 0; 1295 1296 *norm = 0.0; 1297 for (k = 0; k < mbs; k++) { /* k_th block row */ 1298 for (j = 0; j < bs; j++) sum[j] = 0.0; 1299 /*-- col sum --*/ 1300 i = jl[k]; /* first |A(i,k)| to be added */ 1301 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1302 at step k */ 1303 while (i < mbs) { 1304 nexti = jl[i]; /* next block row to be added */ 1305 ik = il[i]; /* block index of A(i,k) in the array a */ 1306 for (j = 0; j < bs; j++) { 1307 v = a->a + ik * bs2 + j * bs; 1308 for (k1 = 0; k1 < bs; k1++) { 1309 sum[j] += PetscAbsScalar(*v); 1310 v++; 1311 } 1312 } 1313 /* update il, jl */ 1314 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1315 jmax = a->i[i + 1]; 1316 if (jmin < jmax) { 1317 il[i] = jmin; 1318 j = a->j[jmin]; 1319 jl[i] = jl[j]; 1320 jl[j] = i; 1321 } 1322 i = nexti; 1323 } 1324 /*-- row sum --*/ 1325 jmin = a->i[k]; 1326 jmax = a->i[k + 1]; 1327 for (i = jmin; i < jmax; i++) { 1328 for (j = 0; j < bs; j++) { 1329 v = a->a + i * bs2 + j; 1330 for (k1 = 0; k1 < bs; k1++) { 1331 sum[j] += PetscAbsScalar(*v); 1332 v += bs; 1333 } 1334 } 1335 } 1336 /* add k_th block row to il, jl */ 1337 col = aj + jmin; 1338 if (jmax - jmin > 0 && *col == k) jmin++; 1339 if (jmin < jmax) { 1340 il[k] = jmin; 1341 j = a->j[jmin]; 1342 jl[k] = jl[j]; 1343 jl[j] = k; 1344 } 1345 for (j = 0; j < bs; j++) { 1346 if (sum[j] > *norm) *norm = sum[j]; 1347 } 1348 } 1349 PetscCall(PetscFree3(sum, il, jl)); 1350 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1351 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1352 PetscFunctionReturn(0); 1353 } 1354 1355 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1356 { 1357 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1358 1359 PetscFunctionBegin; 1360 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1361 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1362 *flg = PETSC_FALSE; 1363 PetscFunctionReturn(0); 1364 } 1365 1366 /* if the a->i are the same */ 1367 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1368 if (!*flg) PetscFunctionReturn(0); 1369 1370 /* if a->j are the same */ 1371 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1372 if (!*flg) PetscFunctionReturn(0); 1373 1374 /* if a->a are the same */ 1375 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1376 PetscFunctionReturn(0); 1377 } 1378 1379 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1380 { 1381 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1382 PetscInt i, j, k, row, bs, ambs, bs2; 1383 const PetscInt *ai, *aj; 1384 PetscScalar *x, zero = 0.0; 1385 const MatScalar *aa, *aa_j; 1386 1387 PetscFunctionBegin; 1388 bs = A->rmap->bs; 1389 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1390 1391 aa = a->a; 1392 ambs = a->mbs; 1393 1394 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1395 PetscInt *diag = a->diag; 1396 aa = a->a; 1397 ambs = a->mbs; 1398 PetscCall(VecGetArray(v, &x)); 1399 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1400 PetscCall(VecRestoreArray(v, &x)); 1401 PetscFunctionReturn(0); 1402 } 1403 1404 ai = a->i; 1405 aj = a->j; 1406 bs2 = a->bs2; 1407 PetscCall(VecSet(v, zero)); 1408 if (!a->nz) PetscFunctionReturn(0); 1409 PetscCall(VecGetArray(v, &x)); 1410 for (i = 0; i < ambs; i++) { 1411 j = ai[i]; 1412 if (aj[j] == i) { /* if this is a diagonal element */ 1413 row = i * bs; 1414 aa_j = aa + j * bs2; 1415 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1416 } 1417 } 1418 PetscCall(VecRestoreArray(v, &x)); 1419 PetscFunctionReturn(0); 1420 } 1421 1422 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1423 { 1424 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1425 PetscScalar x; 1426 const PetscScalar *l, *li, *ri; 1427 MatScalar *aa, *v; 1428 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1429 const PetscInt *ai, *aj; 1430 PetscBool flg; 1431 1432 PetscFunctionBegin; 1433 if (ll != rr) { 1434 PetscCall(VecEqual(ll, rr, &flg)); 1435 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1436 } 1437 if (!ll) PetscFunctionReturn(0); 1438 ai = a->i; 1439 aj = a->j; 1440 aa = a->a; 1441 m = A->rmap->N; 1442 bs = A->rmap->bs; 1443 mbs = a->mbs; 1444 bs2 = a->bs2; 1445 1446 PetscCall(VecGetArrayRead(ll, &l)); 1447 PetscCall(VecGetLocalSize(ll, &lm)); 1448 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1449 for (i = 0; i < mbs; i++) { /* for each block row */ 1450 M = ai[i + 1] - ai[i]; 1451 li = l + i * bs; 1452 v = aa + bs2 * ai[i]; 1453 for (j = 0; j < M; j++) { /* for each block */ 1454 ri = l + bs * aj[ai[i] + j]; 1455 for (k = 0; k < bs; k++) { 1456 x = ri[k]; 1457 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1458 } 1459 } 1460 } 1461 PetscCall(VecRestoreArrayRead(ll, &l)); 1462 PetscCall(PetscLogFlops(2.0 * a->nz)); 1463 PetscFunctionReturn(0); 1464 } 1465 1466 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1467 { 1468 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1469 1470 PetscFunctionBegin; 1471 info->block_size = a->bs2; 1472 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1473 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1474 info->nz_unneeded = info->nz_allocated - info->nz_used; 1475 info->assemblies = A->num_ass; 1476 info->mallocs = A->info.mallocs; 1477 info->memory = 0; /* REVIEW ME */ 1478 if (A->factortype) { 1479 info->fill_ratio_given = A->info.fill_ratio_given; 1480 info->fill_ratio_needed = A->info.fill_ratio_needed; 1481 info->factor_mallocs = A->info.factor_mallocs; 1482 } else { 1483 info->fill_ratio_given = 0; 1484 info->fill_ratio_needed = 0; 1485 info->factor_mallocs = 0; 1486 } 1487 PetscFunctionReturn(0); 1488 } 1489 1490 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1491 { 1492 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1493 1494 PetscFunctionBegin; 1495 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1496 PetscFunctionReturn(0); 1497 } 1498 1499 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1500 { 1501 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1502 PetscInt i, j, n, row, col, bs, mbs; 1503 const PetscInt *ai, *aj; 1504 PetscReal atmp; 1505 const MatScalar *aa; 1506 PetscScalar *x; 1507 PetscInt ncols, brow, bcol, krow, kcol; 1508 1509 PetscFunctionBegin; 1510 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1511 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1512 bs = A->rmap->bs; 1513 aa = a->a; 1514 ai = a->i; 1515 aj = a->j; 1516 mbs = a->mbs; 1517 1518 PetscCall(VecSet(v, 0.0)); 1519 PetscCall(VecGetArray(v, &x)); 1520 PetscCall(VecGetLocalSize(v, &n)); 1521 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1522 for (i = 0; i < mbs; i++) { 1523 ncols = ai[1] - ai[0]; 1524 ai++; 1525 brow = bs * i; 1526 for (j = 0; j < ncols; j++) { 1527 bcol = bs * (*aj); 1528 for (kcol = 0; kcol < bs; kcol++) { 1529 col = bcol + kcol; /* col index */ 1530 for (krow = 0; krow < bs; krow++) { 1531 atmp = PetscAbsScalar(*aa); 1532 aa++; 1533 row = brow + krow; /* row index */ 1534 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1535 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1536 } 1537 } 1538 aj++; 1539 } 1540 } 1541 PetscCall(VecRestoreArray(v, &x)); 1542 PetscFunctionReturn(0); 1543 } 1544 1545 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1546 { 1547 PetscFunctionBegin; 1548 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1549 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1550 PetscFunctionReturn(0); 1551 } 1552 1553 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1554 { 1555 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1556 PetscScalar *z = c; 1557 const PetscScalar *xb; 1558 PetscScalar x1; 1559 const MatScalar *v = a->a, *vv; 1560 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1561 #if defined(PETSC_USE_COMPLEX) 1562 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1563 #else 1564 const int aconj = 0; 1565 #endif 1566 1567 PetscFunctionBegin; 1568 for (i = 0; i < mbs; i++) { 1569 n = ii[1] - ii[0]; 1570 ii++; 1571 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1572 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1573 jj = idx; 1574 vv = v; 1575 for (k = 0; k < cn; k++) { 1576 idx = jj; 1577 v = vv; 1578 for (j = 0; j < n; j++) { 1579 xb = b + (*idx); 1580 x1 = xb[0 + k * bm]; 1581 z[0 + k * cm] += v[0] * x1; 1582 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1583 v += 1; 1584 ++idx; 1585 } 1586 } 1587 z += 1; 1588 } 1589 PetscFunctionReturn(0); 1590 } 1591 1592 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1593 { 1594 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1595 PetscScalar *z = c; 1596 const PetscScalar *xb; 1597 PetscScalar x1, x2; 1598 const MatScalar *v = a->a, *vv; 1599 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1600 1601 PetscFunctionBegin; 1602 for (i = 0; i < mbs; i++) { 1603 n = ii[1] - ii[0]; 1604 ii++; 1605 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1606 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1607 jj = idx; 1608 vv = v; 1609 for (k = 0; k < cn; k++) { 1610 idx = jj; 1611 v = vv; 1612 for (j = 0; j < n; j++) { 1613 xb = b + 2 * (*idx); 1614 x1 = xb[0 + k * bm]; 1615 x2 = xb[1 + k * bm]; 1616 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1617 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1618 if (*idx != i) { 1619 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1620 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1621 } 1622 v += 4; 1623 ++idx; 1624 } 1625 } 1626 z += 2; 1627 } 1628 PetscFunctionReturn(0); 1629 } 1630 1631 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1632 { 1633 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1634 PetscScalar *z = c; 1635 const PetscScalar *xb; 1636 PetscScalar x1, x2, x3; 1637 const MatScalar *v = a->a, *vv; 1638 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1639 1640 PetscFunctionBegin; 1641 for (i = 0; i < mbs; i++) { 1642 n = ii[1] - ii[0]; 1643 ii++; 1644 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1645 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1646 jj = idx; 1647 vv = v; 1648 for (k = 0; k < cn; k++) { 1649 idx = jj; 1650 v = vv; 1651 for (j = 0; j < n; j++) { 1652 xb = b + 3 * (*idx); 1653 x1 = xb[0 + k * bm]; 1654 x2 = xb[1 + k * bm]; 1655 x3 = xb[2 + k * bm]; 1656 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1657 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1658 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1659 if (*idx != i) { 1660 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1661 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1662 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1663 } 1664 v += 9; 1665 ++idx; 1666 } 1667 } 1668 z += 3; 1669 } 1670 PetscFunctionReturn(0); 1671 } 1672 1673 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1674 { 1675 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1676 PetscScalar *z = c; 1677 const PetscScalar *xb; 1678 PetscScalar x1, x2, x3, x4; 1679 const MatScalar *v = a->a, *vv; 1680 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1681 1682 PetscFunctionBegin; 1683 for (i = 0; i < mbs; i++) { 1684 n = ii[1] - ii[0]; 1685 ii++; 1686 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1687 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1688 jj = idx; 1689 vv = v; 1690 for (k = 0; k < cn; k++) { 1691 idx = jj; 1692 v = vv; 1693 for (j = 0; j < n; j++) { 1694 xb = b + 4 * (*idx); 1695 x1 = xb[0 + k * bm]; 1696 x2 = xb[1 + k * bm]; 1697 x3 = xb[2 + k * bm]; 1698 x4 = xb[3 + k * bm]; 1699 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1700 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1701 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1702 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1703 if (*idx != i) { 1704 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1705 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1706 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1707 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1708 } 1709 v += 16; 1710 ++idx; 1711 } 1712 } 1713 z += 4; 1714 } 1715 PetscFunctionReturn(0); 1716 } 1717 1718 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1719 { 1720 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1721 PetscScalar *z = c; 1722 const PetscScalar *xb; 1723 PetscScalar x1, x2, x3, x4, x5; 1724 const MatScalar *v = a->a, *vv; 1725 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1726 1727 PetscFunctionBegin; 1728 for (i = 0; i < mbs; i++) { 1729 n = ii[1] - ii[0]; 1730 ii++; 1731 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1732 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1733 jj = idx; 1734 vv = v; 1735 for (k = 0; k < cn; k++) { 1736 idx = jj; 1737 v = vv; 1738 for (j = 0; j < n; j++) { 1739 xb = b + 5 * (*idx); 1740 x1 = xb[0 + k * bm]; 1741 x2 = xb[1 + k * bm]; 1742 x3 = xb[2 + k * bm]; 1743 x4 = xb[3 + k * bm]; 1744 x5 = xb[4 + k * cm]; 1745 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1746 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1747 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1748 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1749 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1750 if (*idx != i) { 1751 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1752 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1753 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1754 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1755 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1756 } 1757 v += 25; 1758 ++idx; 1759 } 1760 } 1761 z += 5; 1762 } 1763 PetscFunctionReturn(0); 1764 } 1765 1766 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1767 { 1768 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1769 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1770 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1771 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1772 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1773 PetscBLASInt bbs, bcn, bbm, bcm; 1774 PetscScalar *z = NULL; 1775 PetscScalar *c, *b; 1776 const MatScalar *v; 1777 const PetscInt *idx, *ii; 1778 PetscScalar _DOne = 1.0; 1779 1780 PetscFunctionBegin; 1781 if (!cm || !cn) PetscFunctionReturn(0); 1782 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1783 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1784 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1785 b = bd->v; 1786 PetscCall(MatZeroEntries(C)); 1787 PetscCall(MatDenseGetArray(C, &c)); 1788 switch (bs) { 1789 case 1: 1790 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1791 break; 1792 case 2: 1793 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1794 break; 1795 case 3: 1796 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1797 break; 1798 case 4: 1799 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1800 break; 1801 case 5: 1802 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1803 break; 1804 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1805 PetscCall(PetscBLASIntCast(bs, &bbs)); 1806 PetscCall(PetscBLASIntCast(cn, &bcn)); 1807 PetscCall(PetscBLASIntCast(bm, &bbm)); 1808 PetscCall(PetscBLASIntCast(cm, &bcm)); 1809 idx = a->j; 1810 v = a->a; 1811 mbs = a->mbs; 1812 ii = a->i; 1813 z = c; 1814 for (i = 0; i < mbs; i++) { 1815 n = ii[1] - ii[0]; 1816 ii++; 1817 for (j = 0; j < n; j++) { 1818 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1819 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1820 v += bs2; 1821 } 1822 z += bs; 1823 } 1824 } 1825 PetscCall(MatDenseRestoreArray(C, &c)); 1826 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1827 PetscFunctionReturn(0); 1828 } 1829