1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 13 const PetscInt *idx; 14 PetscBT table_out, table_in; 15 16 PetscFunctionBegin; 17 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 PetscCall(PetscBTCreate(mbs, &table_out)); 23 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 24 PetscCall(PetscBTCreate(mbs, &table_in)); 25 26 for (i = 0; i < is_max; i++) { /* for each is */ 27 isz = 0; 28 PetscCall(PetscBTMemzero(mbs, table_out)); 29 30 /* Extract the indices, assume there can be duplicate entries */ 31 PetscCall(ISGetIndices(is[i], &idx)); 32 PetscCall(ISGetLocalSize(is[i], &n)); 33 34 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35 bcol_max = 0; 36 for (j = 0; j < n; ++j) { 37 brow = idx[j] / bs; /* convert the indices into block indices */ 38 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39 if (!PetscBTLookupSet(table_out, brow)) { 40 nidx[isz++] = brow; 41 if (bcol_max < brow) bcol_max = brow; 42 } 43 } 44 PetscCall(ISRestoreIndices(is[i], &idx)); 45 PetscCall(ISDestroy(&is[i])); 46 47 k = 0; 48 for (j = 0; j < ov; j++) { /* for each overlap */ 49 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 50 PetscCall(PetscBTMemzero(mbs, table_in)); 51 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52 53 n = isz; /* length of the updated is[i] */ 54 for (brow = 0; brow < mbs; brow++) { 55 start = ai[brow]; 56 end = ai[brow + 1]; 57 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l < end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out, bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l < end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in, bcol)) { 72 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 80 } /* for each is */ 81 PetscCall(PetscBTDestroy(&table_out)); 82 PetscCall(PetscFree(nidx)); 83 PetscCall(PetscBTDestroy(&table_in)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 88 Zero some ops' to avoid invalid use */ 89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90 { 91 PetscFunctionBegin; 92 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93 Bseq->ops->mult = NULL; 94 Bseq->ops->multadd = NULL; 95 Bseq->ops->multtranspose = NULL; 96 Bseq->ops->multtransposeadd = NULL; 97 Bseq->ops->lufactor = NULL; 98 Bseq->ops->choleskyfactor = NULL; 99 Bseq->ops->lufactorsymbolic = NULL; 100 Bseq->ops->choleskyfactorsymbolic = NULL; 101 Bseq->ops->getinertia = NULL; 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 107 { 108 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111 const PetscInt *irow, *icol; 112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113 PetscInt *aj = a->j, *ai = a->i; 114 MatScalar *mat_a; 115 Mat C; 116 PetscBool flag; 117 118 PetscFunctionBegin; 119 120 PetscCall(ISGetIndices(isrow, &irow)); 121 PetscCall(ISGetIndices(iscol, &icol)); 122 PetscCall(ISGetLocalSize(isrow, &nrows)); 123 PetscCall(ISGetLocalSize(iscol, &ncols)); 124 125 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126 ssmap = smap; 127 PetscCall(PetscMalloc1(1 + nrows, &lens)); 128 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129 /* determine lens of each row */ 130 for (i = 0; i < nrows; i++) { 131 kstart = ai[irow[i]]; 132 kend = kstart + a->ilen[irow[i]]; 133 lens[i] = 0; 134 for (k = kstart; k < kend; k++) { 135 if (ssmap[aj[k]]) lens[i]++; 136 } 137 } 138 /* Create and fill new matrix */ 139 if (scall == MAT_REUSE_MATRIX) { 140 c = (Mat_SeqSBAIJ *)((*B)->data); 141 142 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 143 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 144 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 145 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146 C = *B; 147 } else { 148 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 149 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 150 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 151 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 152 } 153 c = (Mat_SeqSBAIJ *)(C->data); 154 for (i = 0; i < nrows; i++) { 155 row = irow[i]; 156 kstart = ai[row]; 157 kend = kstart + a->ilen[row]; 158 mat_i = c->i[i]; 159 mat_j = c->j + mat_i; 160 mat_a = c->a + mat_i * bs2; 161 mat_ilen = c->ilen + i; 162 for (k = kstart; k < kend; k++) { 163 if ((tcol = ssmap[a->j[k]])) { 164 *mat_j++ = tcol - 1; 165 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 166 mat_a += bs2; 167 (*mat_ilen)++; 168 } 169 } 170 } 171 /* sort */ 172 { 173 MatScalar *work; 174 175 PetscCall(PetscMalloc1(bs2, &work)); 176 for (i = 0; i < nrows; i++) { 177 PetscInt ilen; 178 mat_i = c->i[i]; 179 mat_j = c->j + mat_i; 180 mat_a = c->a + mat_i * bs2; 181 ilen = c->ilen[i]; 182 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 183 } 184 PetscCall(PetscFree(work)); 185 } 186 187 /* Free work space */ 188 PetscCall(ISRestoreIndices(iscol, &icol)); 189 PetscCall(PetscFree(smap)); 190 PetscCall(PetscFree(lens)); 191 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 192 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 193 194 PetscCall(ISRestoreIndices(isrow, &irow)); 195 *B = C; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 200 { 201 IS is1, is2; 202 203 PetscFunctionBegin; 204 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 205 if (isrow == iscol) { 206 is2 = is1; 207 PetscCall(PetscObjectReference((PetscObject)is2)); 208 } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 209 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 210 PetscCall(ISDestroy(&is1)); 211 PetscCall(ISDestroy(&is2)); 212 213 if (isrow != iscol) { 214 PetscBool isequal; 215 PetscCall(ISEqual(isrow, iscol, &isequal)); 216 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 217 } 218 PetscFunctionReturn(PETSC_SUCCESS); 219 } 220 221 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 222 { 223 PetscInt i; 224 225 PetscFunctionBegin; 226 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 227 228 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 229 PetscFunctionReturn(PETSC_SUCCESS); 230 } 231 232 /* Should check that shapes of vectors and matrices match */ 233 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 234 { 235 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 236 PetscScalar *z, x1, x2, zero = 0.0; 237 const PetscScalar *x, *xb; 238 const MatScalar *v; 239 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 240 const PetscInt *aj = a->j, *ai = a->i, *ib; 241 PetscInt nonzerorow = 0; 242 243 PetscFunctionBegin; 244 PetscCall(VecSet(zz, zero)); 245 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 246 PetscCall(VecGetArrayRead(xx, &x)); 247 PetscCall(VecGetArray(zz, &z)); 248 249 v = a->a; 250 xb = x; 251 252 for (i = 0; i < mbs; i++) { 253 n = ai[1] - ai[0]; /* length of i_th block row of A */ 254 x1 = xb[0]; 255 x2 = xb[1]; 256 ib = aj + *ai; 257 jmin = 0; 258 nonzerorow += (n > 0); 259 if (*ib == i) { /* (diag of A)*x */ 260 z[2 * i] += v[0] * x1 + v[2] * x2; 261 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 262 v += 4; 263 jmin++; 264 } 265 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 266 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 267 for (j = jmin; j < n; j++) { 268 /* (strict lower triangular part of A)*x */ 269 cval = ib[j] * 2; 270 z[cval] += v[0] * x1 + v[1] * x2; 271 z[cval + 1] += v[2] * x1 + v[3] * x2; 272 /* (strict upper triangular part of A)*x */ 273 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 274 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 275 v += 4; 276 } 277 xb += 2; 278 ai++; 279 } 280 281 PetscCall(VecRestoreArrayRead(xx, &x)); 282 PetscCall(VecRestoreArray(zz, &z)); 283 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 284 PetscFunctionReturn(PETSC_SUCCESS); 285 } 286 287 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 290 PetscScalar *z, x1, x2, x3, zero = 0.0; 291 const PetscScalar *x, *xb; 292 const MatScalar *v; 293 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 294 const PetscInt *aj = a->j, *ai = a->i, *ib; 295 PetscInt nonzerorow = 0; 296 297 PetscFunctionBegin; 298 PetscCall(VecSet(zz, zero)); 299 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 300 PetscCall(VecGetArrayRead(xx, &x)); 301 PetscCall(VecGetArray(zz, &z)); 302 303 v = a->a; 304 xb = x; 305 306 for (i = 0; i < mbs; i++) { 307 n = ai[1] - ai[0]; /* length of i_th block row of A */ 308 x1 = xb[0]; 309 x2 = xb[1]; 310 x3 = xb[2]; 311 ib = aj + *ai; 312 jmin = 0; 313 nonzerorow += (n > 0); 314 if (*ib == i) { /* (diag of A)*x */ 315 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 316 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 317 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 318 v += 9; 319 jmin++; 320 } 321 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 322 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 323 for (j = jmin; j < n; j++) { 324 /* (strict lower triangular part of A)*x */ 325 cval = ib[j] * 3; 326 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 327 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 328 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 329 /* (strict upper triangular part of A)*x */ 330 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 331 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 332 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 333 v += 9; 334 } 335 xb += 3; 336 ai++; 337 } 338 339 PetscCall(VecRestoreArrayRead(xx, &x)); 340 PetscCall(VecRestoreArray(zz, &z)); 341 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 342 PetscFunctionReturn(PETSC_SUCCESS); 343 } 344 345 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 346 { 347 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 348 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 349 const PetscScalar *x, *xb; 350 const MatScalar *v; 351 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 352 const PetscInt *aj = a->j, *ai = a->i, *ib; 353 PetscInt nonzerorow = 0; 354 355 PetscFunctionBegin; 356 PetscCall(VecSet(zz, zero)); 357 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 358 PetscCall(VecGetArrayRead(xx, &x)); 359 PetscCall(VecGetArray(zz, &z)); 360 361 v = a->a; 362 xb = x; 363 364 for (i = 0; i < mbs; i++) { 365 n = ai[1] - ai[0]; /* length of i_th block row of A */ 366 x1 = xb[0]; 367 x2 = xb[1]; 368 x3 = xb[2]; 369 x4 = xb[3]; 370 ib = aj + *ai; 371 jmin = 0; 372 nonzerorow += (n > 0); 373 if (*ib == i) { /* (diag of A)*x */ 374 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 375 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 376 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 377 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 378 v += 16; 379 jmin++; 380 } 381 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 382 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 383 for (j = jmin; j < n; j++) { 384 /* (strict lower triangular part of A)*x */ 385 cval = ib[j] * 4; 386 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 387 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 388 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 389 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 390 /* (strict upper triangular part of A)*x */ 391 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 392 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 393 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 394 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 395 v += 16; 396 } 397 xb += 4; 398 ai++; 399 } 400 401 PetscCall(VecRestoreArrayRead(xx, &x)); 402 PetscCall(VecRestoreArray(zz, &z)); 403 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 404 PetscFunctionReturn(PETSC_SUCCESS); 405 } 406 407 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 408 { 409 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 410 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 411 const PetscScalar *x, *xb; 412 const MatScalar *v; 413 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 414 const PetscInt *aj = a->j, *ai = a->i, *ib; 415 PetscInt nonzerorow = 0; 416 417 PetscFunctionBegin; 418 PetscCall(VecSet(zz, zero)); 419 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 420 PetscCall(VecGetArrayRead(xx, &x)); 421 PetscCall(VecGetArray(zz, &z)); 422 423 v = a->a; 424 xb = x; 425 426 for (i = 0; i < mbs; i++) { 427 n = ai[1] - ai[0]; /* length of i_th block row of A */ 428 x1 = xb[0]; 429 x2 = xb[1]; 430 x3 = xb[2]; 431 x4 = xb[3]; 432 x5 = xb[4]; 433 ib = aj + *ai; 434 jmin = 0; 435 nonzerorow += (n > 0); 436 if (*ib == i) { /* (diag of A)*x */ 437 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 438 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 439 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 440 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 441 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 442 v += 25; 443 jmin++; 444 } 445 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 446 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 447 for (j = jmin; j < n; j++) { 448 /* (strict lower triangular part of A)*x */ 449 cval = ib[j] * 5; 450 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 451 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 452 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 453 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 454 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 455 /* (strict upper triangular part of A)*x */ 456 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 457 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 458 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 459 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 460 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 461 v += 25; 462 } 463 xb += 5; 464 ai++; 465 } 466 467 PetscCall(VecRestoreArrayRead(xx, &x)); 468 PetscCall(VecRestoreArray(zz, &z)); 469 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 470 PetscFunctionReturn(PETSC_SUCCESS); 471 } 472 473 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 474 { 475 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 476 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 477 const PetscScalar *x, *xb; 478 const MatScalar *v; 479 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 480 const PetscInt *aj = a->j, *ai = a->i, *ib; 481 PetscInt nonzerorow = 0; 482 483 PetscFunctionBegin; 484 PetscCall(VecSet(zz, zero)); 485 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 486 PetscCall(VecGetArrayRead(xx, &x)); 487 PetscCall(VecGetArray(zz, &z)); 488 489 v = a->a; 490 xb = x; 491 492 for (i = 0; i < mbs; i++) { 493 n = ai[1] - ai[0]; /* length of i_th block row of A */ 494 x1 = xb[0]; 495 x2 = xb[1]; 496 x3 = xb[2]; 497 x4 = xb[3]; 498 x5 = xb[4]; 499 x6 = xb[5]; 500 ib = aj + *ai; 501 jmin = 0; 502 nonzerorow += (n > 0); 503 if (*ib == i) { /* (diag of A)*x */ 504 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 505 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 506 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 507 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 508 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 509 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 510 v += 36; 511 jmin++; 512 } 513 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 514 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 515 for (j = jmin; j < n; j++) { 516 /* (strict lower triangular part of A)*x */ 517 cval = ib[j] * 6; 518 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 519 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 520 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 521 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 522 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 523 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 524 /* (strict upper triangular part of A)*x */ 525 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 526 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 527 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 528 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 529 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 530 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 531 v += 36; 532 } 533 xb += 6; 534 ai++; 535 } 536 537 PetscCall(VecRestoreArrayRead(xx, &x)); 538 PetscCall(VecRestoreArray(zz, &z)); 539 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 540 PetscFunctionReturn(PETSC_SUCCESS); 541 } 542 543 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 544 { 545 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 546 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 547 const PetscScalar *x, *xb; 548 const MatScalar *v; 549 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 550 const PetscInt *aj = a->j, *ai = a->i, *ib; 551 PetscInt nonzerorow = 0; 552 553 PetscFunctionBegin; 554 PetscCall(VecSet(zz, zero)); 555 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 556 PetscCall(VecGetArrayRead(xx, &x)); 557 PetscCall(VecGetArray(zz, &z)); 558 559 v = a->a; 560 xb = x; 561 562 for (i = 0; i < mbs; i++) { 563 n = ai[1] - ai[0]; /* length of i_th block row of A */ 564 x1 = xb[0]; 565 x2 = xb[1]; 566 x3 = xb[2]; 567 x4 = xb[3]; 568 x5 = xb[4]; 569 x6 = xb[5]; 570 x7 = xb[6]; 571 ib = aj + *ai; 572 jmin = 0; 573 nonzerorow += (n > 0); 574 if (*ib == i) { /* (diag of A)*x */ 575 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 576 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 577 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 578 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 579 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 580 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 581 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 582 v += 49; 583 jmin++; 584 } 585 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 586 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 587 for (j = jmin; j < n; j++) { 588 /* (strict lower triangular part of A)*x */ 589 cval = ib[j] * 7; 590 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 591 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 592 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 593 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 594 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 595 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 596 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 597 /* (strict upper triangular part of A)*x */ 598 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 599 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 600 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 601 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 602 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 603 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 604 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 605 v += 49; 606 } 607 xb += 7; 608 ai++; 609 } 610 PetscCall(VecRestoreArrayRead(xx, &x)); 611 PetscCall(VecRestoreArray(zz, &z)); 612 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 613 PetscFunctionReturn(PETSC_SUCCESS); 614 } 615 616 /* 617 This will not work with MatScalar == float because it calls the BLAS 618 */ 619 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 620 { 621 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 622 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 623 const PetscScalar *x, *x_ptr, *xb; 624 const MatScalar *v; 625 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 626 const PetscInt *idx, *aj, *ii; 627 PetscInt nonzerorow = 0; 628 629 PetscFunctionBegin; 630 PetscCall(VecSet(zz, zero)); 631 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 632 PetscCall(VecGetArrayRead(xx, &x)); 633 PetscCall(VecGetArray(zz, &z)); 634 635 x_ptr = x; 636 z_ptr = z; 637 638 aj = a->j; 639 v = a->a; 640 ii = a->i; 641 642 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 643 work = a->mult_work; 644 645 for (i = 0; i < mbs; i++) { 646 n = ii[1] - ii[0]; 647 ncols = n * bs; 648 workt = work; 649 idx = aj + ii[0]; 650 nonzerorow += (n > 0); 651 652 /* upper triangular part */ 653 for (j = 0; j < n; j++) { 654 xb = x_ptr + bs * (*idx++); 655 for (k = 0; k < bs; k++) workt[k] = xb[k]; 656 workt += bs; 657 } 658 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 659 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 660 661 /* strict lower triangular part */ 662 idx = aj + ii[0]; 663 if (n && *idx == i) { 664 ncols -= bs; 665 v += bs2; 666 idx++; 667 n--; 668 } 669 670 if (ncols > 0) { 671 workt = work; 672 PetscCall(PetscArrayzero(workt, ncols)); 673 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 674 for (j = 0; j < n; j++) { 675 zb = z_ptr + bs * (*idx++); 676 for (k = 0; k < bs; k++) zb[k] += workt[k]; 677 workt += bs; 678 } 679 } 680 x += bs; 681 v += n * bs2; 682 z += bs; 683 ii++; 684 } 685 686 PetscCall(VecRestoreArrayRead(xx, &x)); 687 PetscCall(VecRestoreArray(zz, &z)); 688 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 689 PetscFunctionReturn(PETSC_SUCCESS); 690 } 691 692 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 693 { 694 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 695 PetscScalar *z, x1; 696 const PetscScalar *x, *xb; 697 const MatScalar *v; 698 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 699 const PetscInt *aj = a->j, *ai = a->i, *ib; 700 PetscInt nonzerorow = 0; 701 #if defined(PETSC_USE_COMPLEX) 702 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 703 #else 704 const int aconj = 0; 705 #endif 706 707 PetscFunctionBegin; 708 PetscCall(VecCopy(yy, zz)); 709 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 710 PetscCall(VecGetArrayRead(xx, &x)); 711 PetscCall(VecGetArray(zz, &z)); 712 v = a->a; 713 xb = x; 714 715 for (i = 0; i < mbs; i++) { 716 n = ai[1] - ai[0]; /* length of i_th row of A */ 717 x1 = xb[0]; 718 ib = aj + *ai; 719 jmin = 0; 720 nonzerorow += (n > 0); 721 if (n && *ib == i) { /* (diag of A)*x */ 722 z[i] += *v++ * x[*ib++]; 723 jmin++; 724 } 725 if (aconj) { 726 for (j = jmin; j < n; j++) { 727 cval = *ib; 728 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 729 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 730 } 731 } else { 732 for (j = jmin; j < n; j++) { 733 cval = *ib; 734 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 735 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 736 } 737 } 738 xb++; 739 ai++; 740 } 741 742 PetscCall(VecRestoreArrayRead(xx, &x)); 743 PetscCall(VecRestoreArray(zz, &z)); 744 745 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 746 PetscFunctionReturn(PETSC_SUCCESS); 747 } 748 749 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 750 { 751 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 752 PetscScalar *z, x1, x2; 753 const PetscScalar *x, *xb; 754 const MatScalar *v; 755 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 756 const PetscInt *aj = a->j, *ai = a->i, *ib; 757 PetscInt nonzerorow = 0; 758 759 PetscFunctionBegin; 760 PetscCall(VecCopy(yy, zz)); 761 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 762 PetscCall(VecGetArrayRead(xx, &x)); 763 PetscCall(VecGetArray(zz, &z)); 764 765 v = a->a; 766 xb = x; 767 768 for (i = 0; i < mbs; i++) { 769 n = ai[1] - ai[0]; /* length of i_th block row of A */ 770 x1 = xb[0]; 771 x2 = xb[1]; 772 ib = aj + *ai; 773 jmin = 0; 774 nonzerorow += (n > 0); 775 if (n && *ib == i) { /* (diag of A)*x */ 776 z[2 * i] += v[0] * x1 + v[2] * x2; 777 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 778 v += 4; 779 jmin++; 780 } 781 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 782 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 783 for (j = jmin; j < n; j++) { 784 /* (strict lower triangular part of A)*x */ 785 cval = ib[j] * 2; 786 z[cval] += v[0] * x1 + v[1] * x2; 787 z[cval + 1] += v[2] * x1 + v[3] * x2; 788 /* (strict upper triangular part of A)*x */ 789 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 790 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 791 v += 4; 792 } 793 xb += 2; 794 ai++; 795 } 796 PetscCall(VecRestoreArrayRead(xx, &x)); 797 PetscCall(VecRestoreArray(zz, &z)); 798 799 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 800 PetscFunctionReturn(PETSC_SUCCESS); 801 } 802 803 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 804 { 805 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 806 PetscScalar *z, x1, x2, x3; 807 const PetscScalar *x, *xb; 808 const MatScalar *v; 809 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 810 const PetscInt *aj = a->j, *ai = a->i, *ib; 811 PetscInt nonzerorow = 0; 812 813 PetscFunctionBegin; 814 PetscCall(VecCopy(yy, zz)); 815 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 816 PetscCall(VecGetArrayRead(xx, &x)); 817 PetscCall(VecGetArray(zz, &z)); 818 819 v = a->a; 820 xb = x; 821 822 for (i = 0; i < mbs; i++) { 823 n = ai[1] - ai[0]; /* length of i_th block row of A */ 824 x1 = xb[0]; 825 x2 = xb[1]; 826 x3 = xb[2]; 827 ib = aj + *ai; 828 jmin = 0; 829 nonzerorow += (n > 0); 830 if (n && *ib == i) { /* (diag of A)*x */ 831 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 832 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 833 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 834 v += 9; 835 jmin++; 836 } 837 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 838 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 839 for (j = jmin; j < n; j++) { 840 /* (strict lower triangular part of A)*x */ 841 cval = ib[j] * 3; 842 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 843 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 844 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 845 /* (strict upper triangular part of A)*x */ 846 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 847 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 848 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 849 v += 9; 850 } 851 xb += 3; 852 ai++; 853 } 854 855 PetscCall(VecRestoreArrayRead(xx, &x)); 856 PetscCall(VecRestoreArray(zz, &z)); 857 858 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 859 PetscFunctionReturn(PETSC_SUCCESS); 860 } 861 862 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 863 { 864 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 865 PetscScalar *z, x1, x2, x3, x4; 866 const PetscScalar *x, *xb; 867 const MatScalar *v; 868 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 869 const PetscInt *aj = a->j, *ai = a->i, *ib; 870 PetscInt nonzerorow = 0; 871 872 PetscFunctionBegin; 873 PetscCall(VecCopy(yy, zz)); 874 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 875 PetscCall(VecGetArrayRead(xx, &x)); 876 PetscCall(VecGetArray(zz, &z)); 877 878 v = a->a; 879 xb = x; 880 881 for (i = 0; i < mbs; i++) { 882 n = ai[1] - ai[0]; /* length of i_th block row of A */ 883 x1 = xb[0]; 884 x2 = xb[1]; 885 x3 = xb[2]; 886 x4 = xb[3]; 887 ib = aj + *ai; 888 jmin = 0; 889 nonzerorow += (n > 0); 890 if (n && *ib == i) { /* (diag of A)*x */ 891 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 892 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 893 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 894 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 895 v += 16; 896 jmin++; 897 } 898 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 899 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 900 for (j = jmin; j < n; j++) { 901 /* (strict lower triangular part of A)*x */ 902 cval = ib[j] * 4; 903 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 904 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 905 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 906 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 907 /* (strict upper triangular part of A)*x */ 908 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 909 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 910 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 911 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 912 v += 16; 913 } 914 xb += 4; 915 ai++; 916 } 917 918 PetscCall(VecRestoreArrayRead(xx, &x)); 919 PetscCall(VecRestoreArray(zz, &z)); 920 921 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 922 PetscFunctionReturn(PETSC_SUCCESS); 923 } 924 925 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 926 { 927 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 928 PetscScalar *z, x1, x2, x3, x4, x5; 929 const PetscScalar *x, *xb; 930 const MatScalar *v; 931 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 932 const PetscInt *aj = a->j, *ai = a->i, *ib; 933 PetscInt nonzerorow = 0; 934 935 PetscFunctionBegin; 936 PetscCall(VecCopy(yy, zz)); 937 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 938 PetscCall(VecGetArrayRead(xx, &x)); 939 PetscCall(VecGetArray(zz, &z)); 940 941 v = a->a; 942 xb = x; 943 944 for (i = 0; i < mbs; i++) { 945 n = ai[1] - ai[0]; /* length of i_th block row of A */ 946 x1 = xb[0]; 947 x2 = xb[1]; 948 x3 = xb[2]; 949 x4 = xb[3]; 950 x5 = xb[4]; 951 ib = aj + *ai; 952 jmin = 0; 953 nonzerorow += (n > 0); 954 if (n && *ib == i) { /* (diag of A)*x */ 955 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 956 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 957 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 958 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 959 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 960 v += 25; 961 jmin++; 962 } 963 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 964 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 965 for (j = jmin; j < n; j++) { 966 /* (strict lower triangular part of A)*x */ 967 cval = ib[j] * 5; 968 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 969 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 970 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 971 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 972 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 973 /* (strict upper triangular part of A)*x */ 974 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 975 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 976 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 977 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 978 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 979 v += 25; 980 } 981 xb += 5; 982 ai++; 983 } 984 985 PetscCall(VecRestoreArrayRead(xx, &x)); 986 PetscCall(VecRestoreArray(zz, &z)); 987 988 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 989 PetscFunctionReturn(PETSC_SUCCESS); 990 } 991 992 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 993 { 994 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 995 PetscScalar *z, x1, x2, x3, x4, x5, x6; 996 const PetscScalar *x, *xb; 997 const MatScalar *v; 998 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 999 const PetscInt *aj = a->j, *ai = a->i, *ib; 1000 PetscInt nonzerorow = 0; 1001 1002 PetscFunctionBegin; 1003 PetscCall(VecCopy(yy, zz)); 1004 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1005 PetscCall(VecGetArrayRead(xx, &x)); 1006 PetscCall(VecGetArray(zz, &z)); 1007 1008 v = a->a; 1009 xb = x; 1010 1011 for (i = 0; i < mbs; i++) { 1012 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1013 x1 = xb[0]; 1014 x2 = xb[1]; 1015 x3 = xb[2]; 1016 x4 = xb[3]; 1017 x5 = xb[4]; 1018 x6 = xb[5]; 1019 ib = aj + *ai; 1020 jmin = 0; 1021 nonzerorow += (n > 0); 1022 if (n && *ib == i) { /* (diag of A)*x */ 1023 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1024 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1025 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1026 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1027 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1028 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1029 v += 36; 1030 jmin++; 1031 } 1032 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1033 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1034 for (j = jmin; j < n; j++) { 1035 /* (strict lower triangular part of A)*x */ 1036 cval = ib[j] * 6; 1037 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1038 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1039 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1040 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1041 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1042 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1043 /* (strict upper triangular part of A)*x */ 1044 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1045 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1046 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1047 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1048 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1049 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1050 v += 36; 1051 } 1052 xb += 6; 1053 ai++; 1054 } 1055 1056 PetscCall(VecRestoreArrayRead(xx, &x)); 1057 PetscCall(VecRestoreArray(zz, &z)); 1058 1059 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1060 PetscFunctionReturn(PETSC_SUCCESS); 1061 } 1062 1063 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1064 { 1065 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1066 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1067 const PetscScalar *x, *xb; 1068 const MatScalar *v; 1069 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1070 const PetscInt *aj = a->j, *ai = a->i, *ib; 1071 PetscInt nonzerorow = 0; 1072 1073 PetscFunctionBegin; 1074 PetscCall(VecCopy(yy, zz)); 1075 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1076 PetscCall(VecGetArrayRead(xx, &x)); 1077 PetscCall(VecGetArray(zz, &z)); 1078 1079 v = a->a; 1080 xb = x; 1081 1082 for (i = 0; i < mbs; i++) { 1083 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1084 x1 = xb[0]; 1085 x2 = xb[1]; 1086 x3 = xb[2]; 1087 x4 = xb[3]; 1088 x5 = xb[4]; 1089 x6 = xb[5]; 1090 x7 = xb[6]; 1091 ib = aj + *ai; 1092 jmin = 0; 1093 nonzerorow += (n > 0); 1094 if (n && *ib == i) { /* (diag of A)*x */ 1095 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1096 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1097 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1098 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1099 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1100 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1101 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1102 v += 49; 1103 jmin++; 1104 } 1105 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1106 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1107 for (j = jmin; j < n; j++) { 1108 /* (strict lower triangular part of A)*x */ 1109 cval = ib[j] * 7; 1110 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1111 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1112 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1113 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1114 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1115 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1116 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1117 /* (strict upper triangular part of A)*x */ 1118 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1119 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1120 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1121 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1122 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1123 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1124 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1125 v += 49; 1126 } 1127 xb += 7; 1128 ai++; 1129 } 1130 1131 PetscCall(VecRestoreArrayRead(xx, &x)); 1132 PetscCall(VecRestoreArray(zz, &z)); 1133 1134 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1135 PetscFunctionReturn(PETSC_SUCCESS); 1136 } 1137 1138 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1139 { 1140 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1141 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1142 const PetscScalar *x, *x_ptr, *xb; 1143 const MatScalar *v; 1144 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1145 const PetscInt *idx, *aj, *ii; 1146 PetscInt nonzerorow = 0; 1147 1148 PetscFunctionBegin; 1149 PetscCall(VecCopy(yy, zz)); 1150 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1151 PetscCall(VecGetArrayRead(xx, &x)); 1152 x_ptr = x; 1153 PetscCall(VecGetArray(zz, &z)); 1154 z_ptr = z; 1155 1156 aj = a->j; 1157 v = a->a; 1158 ii = a->i; 1159 1160 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1161 work = a->mult_work; 1162 1163 for (i = 0; i < mbs; i++) { 1164 n = ii[1] - ii[0]; 1165 ncols = n * bs; 1166 workt = work; 1167 idx = aj + ii[0]; 1168 nonzerorow += (n > 0); 1169 1170 /* upper triangular part */ 1171 for (j = 0; j < n; j++) { 1172 xb = x_ptr + bs * (*idx++); 1173 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1174 workt += bs; 1175 } 1176 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1177 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1178 1179 /* strict lower triangular part */ 1180 idx = aj + ii[0]; 1181 if (n && *idx == i) { 1182 ncols -= bs; 1183 v += bs2; 1184 idx++; 1185 n--; 1186 } 1187 if (ncols > 0) { 1188 workt = work; 1189 PetscCall(PetscArrayzero(workt, ncols)); 1190 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1191 for (j = 0; j < n; j++) { 1192 zb = z_ptr + bs * (*idx++); 1193 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1194 workt += bs; 1195 } 1196 } 1197 1198 x += bs; 1199 v += n * bs2; 1200 z += bs; 1201 ii++; 1202 } 1203 1204 PetscCall(VecRestoreArrayRead(xx, &x)); 1205 PetscCall(VecRestoreArray(zz, &z)); 1206 1207 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 1211 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1212 { 1213 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1214 PetscScalar oalpha = alpha; 1215 PetscBLASInt one = 1, totalnz; 1216 1217 PetscFunctionBegin; 1218 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1219 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1220 PetscCall(PetscLogFlops(totalnz)); 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1225 { 1226 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1227 const MatScalar *v = a->a; 1228 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1229 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1230 const PetscInt *aj = a->j, *col; 1231 1232 PetscFunctionBegin; 1233 if (!a->nz) { 1234 *norm = 0.0; 1235 PetscFunctionReturn(PETSC_SUCCESS); 1236 } 1237 if (type == NORM_FROBENIUS) { 1238 for (k = 0; k < mbs; k++) { 1239 jmin = a->i[k]; 1240 jmax = a->i[k + 1]; 1241 col = aj + jmin; 1242 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1243 for (i = 0; i < bs2; i++) { 1244 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1245 v++; 1246 } 1247 jmin++; 1248 } 1249 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1250 for (i = 0; i < bs2; i++) { 1251 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1252 v++; 1253 } 1254 } 1255 } 1256 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1257 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1258 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1259 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1260 for (i = 0; i < mbs; i++) jl[i] = mbs; 1261 il[0] = 0; 1262 1263 *norm = 0.0; 1264 for (k = 0; k < mbs; k++) { /* k_th block row */ 1265 for (j = 0; j < bs; j++) sum[j] = 0.0; 1266 /*-- col sum --*/ 1267 i = jl[k]; /* first |A(i,k)| to be added */ 1268 /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1269 at step k */ 1270 while (i < mbs) { 1271 nexti = jl[i]; /* next block row to be added */ 1272 ik = il[i]; /* block index of A(i,k) in the array a */ 1273 for (j = 0; j < bs; j++) { 1274 v = a->a + ik * bs2 + j * bs; 1275 for (k1 = 0; k1 < bs; k1++) { 1276 sum[j] += PetscAbsScalar(*v); 1277 v++; 1278 } 1279 } 1280 /* update il, jl */ 1281 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1282 jmax = a->i[i + 1]; 1283 if (jmin < jmax) { 1284 il[i] = jmin; 1285 j = a->j[jmin]; 1286 jl[i] = jl[j]; 1287 jl[j] = i; 1288 } 1289 i = nexti; 1290 } 1291 /*-- row sum --*/ 1292 jmin = a->i[k]; 1293 jmax = a->i[k + 1]; 1294 for (i = jmin; i < jmax; i++) { 1295 for (j = 0; j < bs; j++) { 1296 v = a->a + i * bs2 + j; 1297 for (k1 = 0; k1 < bs; k1++) { 1298 sum[j] += PetscAbsScalar(*v); 1299 v += bs; 1300 } 1301 } 1302 } 1303 /* add k_th block row to il, jl */ 1304 col = aj + jmin; 1305 if (jmax - jmin > 0 && *col == k) jmin++; 1306 if (jmin < jmax) { 1307 il[k] = jmin; 1308 j = a->j[jmin]; 1309 jl[k] = jl[j]; 1310 jl[j] = k; 1311 } 1312 for (j = 0; j < bs; j++) { 1313 if (sum[j] > *norm) *norm = sum[j]; 1314 } 1315 } 1316 PetscCall(PetscFree3(sum, il, jl)); 1317 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1318 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1319 PetscFunctionReturn(PETSC_SUCCESS); 1320 } 1321 1322 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1323 { 1324 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1325 1326 PetscFunctionBegin; 1327 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1328 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1329 *flg = PETSC_FALSE; 1330 PetscFunctionReturn(PETSC_SUCCESS); 1331 } 1332 1333 /* if the a->i are the same */ 1334 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1335 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1336 1337 /* if a->j are the same */ 1338 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1339 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1340 1341 /* if a->a are the same */ 1342 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1343 PetscFunctionReturn(PETSC_SUCCESS); 1344 } 1345 1346 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1347 { 1348 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1349 PetscInt i, j, k, row, bs, ambs, bs2; 1350 const PetscInt *ai, *aj; 1351 PetscScalar *x, zero = 0.0; 1352 const MatScalar *aa, *aa_j; 1353 1354 PetscFunctionBegin; 1355 bs = A->rmap->bs; 1356 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1357 1358 aa = a->a; 1359 ambs = a->mbs; 1360 1361 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1362 PetscInt *diag = a->diag; 1363 aa = a->a; 1364 ambs = a->mbs; 1365 PetscCall(VecGetArray(v, &x)); 1366 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1367 PetscCall(VecRestoreArray(v, &x)); 1368 PetscFunctionReturn(PETSC_SUCCESS); 1369 } 1370 1371 ai = a->i; 1372 aj = a->j; 1373 bs2 = a->bs2; 1374 PetscCall(VecSet(v, zero)); 1375 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1376 PetscCall(VecGetArray(v, &x)); 1377 for (i = 0; i < ambs; i++) { 1378 j = ai[i]; 1379 if (aj[j] == i) { /* if this is a diagonal element */ 1380 row = i * bs; 1381 aa_j = aa + j * bs2; 1382 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1383 } 1384 } 1385 PetscCall(VecRestoreArray(v, &x)); 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1390 { 1391 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1392 PetscScalar x; 1393 const PetscScalar *l, *li, *ri; 1394 MatScalar *aa, *v; 1395 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1396 const PetscInt *ai, *aj; 1397 PetscBool flg; 1398 1399 PetscFunctionBegin; 1400 if (ll != rr) { 1401 PetscCall(VecEqual(ll, rr, &flg)); 1402 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1403 } 1404 if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 1405 ai = a->i; 1406 aj = a->j; 1407 aa = a->a; 1408 m = A->rmap->N; 1409 bs = A->rmap->bs; 1410 mbs = a->mbs; 1411 bs2 = a->bs2; 1412 1413 PetscCall(VecGetArrayRead(ll, &l)); 1414 PetscCall(VecGetLocalSize(ll, &lm)); 1415 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1416 for (i = 0; i < mbs; i++) { /* for each block row */ 1417 M = ai[i + 1] - ai[i]; 1418 li = l + i * bs; 1419 v = aa + bs2 * ai[i]; 1420 for (j = 0; j < M; j++) { /* for each block */ 1421 ri = l + bs * aj[ai[i] + j]; 1422 for (k = 0; k < bs; k++) { 1423 x = ri[k]; 1424 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1425 } 1426 } 1427 } 1428 PetscCall(VecRestoreArrayRead(ll, &l)); 1429 PetscCall(PetscLogFlops(2.0 * a->nz)); 1430 PetscFunctionReturn(PETSC_SUCCESS); 1431 } 1432 1433 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1434 { 1435 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1436 1437 PetscFunctionBegin; 1438 info->block_size = a->bs2; 1439 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1440 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1441 info->nz_unneeded = info->nz_allocated - info->nz_used; 1442 info->assemblies = A->num_ass; 1443 info->mallocs = A->info.mallocs; 1444 info->memory = 0; /* REVIEW ME */ 1445 if (A->factortype) { 1446 info->fill_ratio_given = A->info.fill_ratio_given; 1447 info->fill_ratio_needed = A->info.fill_ratio_needed; 1448 info->factor_mallocs = A->info.factor_mallocs; 1449 } else { 1450 info->fill_ratio_given = 0; 1451 info->fill_ratio_needed = 0; 1452 info->factor_mallocs = 0; 1453 } 1454 PetscFunctionReturn(PETSC_SUCCESS); 1455 } 1456 1457 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1458 { 1459 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1460 1461 PetscFunctionBegin; 1462 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1463 PetscFunctionReturn(PETSC_SUCCESS); 1464 } 1465 1466 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1467 { 1468 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1469 PetscInt i, j, n, row, col, bs, mbs; 1470 const PetscInt *ai, *aj; 1471 PetscReal atmp; 1472 const MatScalar *aa; 1473 PetscScalar *x; 1474 PetscInt ncols, brow, bcol, krow, kcol; 1475 1476 PetscFunctionBegin; 1477 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1478 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1479 bs = A->rmap->bs; 1480 aa = a->a; 1481 ai = a->i; 1482 aj = a->j; 1483 mbs = a->mbs; 1484 1485 PetscCall(VecSet(v, 0.0)); 1486 PetscCall(VecGetArray(v, &x)); 1487 PetscCall(VecGetLocalSize(v, &n)); 1488 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1489 for (i = 0; i < mbs; i++) { 1490 ncols = ai[1] - ai[0]; 1491 ai++; 1492 brow = bs * i; 1493 for (j = 0; j < ncols; j++) { 1494 bcol = bs * (*aj); 1495 for (kcol = 0; kcol < bs; kcol++) { 1496 col = bcol + kcol; /* col index */ 1497 for (krow = 0; krow < bs; krow++) { 1498 atmp = PetscAbsScalar(*aa); 1499 aa++; 1500 row = brow + krow; /* row index */ 1501 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1502 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1503 } 1504 } 1505 aj++; 1506 } 1507 } 1508 PetscCall(VecRestoreArray(v, &x)); 1509 PetscFunctionReturn(PETSC_SUCCESS); 1510 } 1511 1512 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1513 { 1514 PetscFunctionBegin; 1515 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1516 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1517 PetscFunctionReturn(PETSC_SUCCESS); 1518 } 1519 1520 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1521 { 1522 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1523 PetscScalar *z = c; 1524 const PetscScalar *xb; 1525 PetscScalar x1; 1526 const MatScalar *v = a->a, *vv; 1527 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1528 #if defined(PETSC_USE_COMPLEX) 1529 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1530 #else 1531 const int aconj = 0; 1532 #endif 1533 1534 PetscFunctionBegin; 1535 for (i = 0; i < mbs; i++) { 1536 n = ii[1] - ii[0]; 1537 ii++; 1538 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1539 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1540 jj = idx; 1541 vv = v; 1542 for (k = 0; k < cn; k++) { 1543 idx = jj; 1544 v = vv; 1545 for (j = 0; j < n; j++) { 1546 xb = b + (*idx); 1547 x1 = xb[0 + k * bm]; 1548 z[0 + k * cm] += v[0] * x1; 1549 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1550 v += 1; 1551 ++idx; 1552 } 1553 } 1554 z += 1; 1555 } 1556 PetscFunctionReturn(PETSC_SUCCESS); 1557 } 1558 1559 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1560 { 1561 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1562 PetscScalar *z = c; 1563 const PetscScalar *xb; 1564 PetscScalar x1, x2; 1565 const MatScalar *v = a->a, *vv; 1566 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1567 1568 PetscFunctionBegin; 1569 for (i = 0; i < mbs; i++) { 1570 n = ii[1] - ii[0]; 1571 ii++; 1572 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1573 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1574 jj = idx; 1575 vv = v; 1576 for (k = 0; k < cn; k++) { 1577 idx = jj; 1578 v = vv; 1579 for (j = 0; j < n; j++) { 1580 xb = b + 2 * (*idx); 1581 x1 = xb[0 + k * bm]; 1582 x2 = xb[1 + k * bm]; 1583 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1584 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1585 if (*idx != i) { 1586 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1587 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1588 } 1589 v += 4; 1590 ++idx; 1591 } 1592 } 1593 z += 2; 1594 } 1595 PetscFunctionReturn(PETSC_SUCCESS); 1596 } 1597 1598 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1599 { 1600 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1601 PetscScalar *z = c; 1602 const PetscScalar *xb; 1603 PetscScalar x1, x2, x3; 1604 const MatScalar *v = a->a, *vv; 1605 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1606 1607 PetscFunctionBegin; 1608 for (i = 0; i < mbs; i++) { 1609 n = ii[1] - ii[0]; 1610 ii++; 1611 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1612 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1613 jj = idx; 1614 vv = v; 1615 for (k = 0; k < cn; k++) { 1616 idx = jj; 1617 v = vv; 1618 for (j = 0; j < n; j++) { 1619 xb = b + 3 * (*idx); 1620 x1 = xb[0 + k * bm]; 1621 x2 = xb[1 + k * bm]; 1622 x3 = xb[2 + k * bm]; 1623 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1624 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1625 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1626 if (*idx != i) { 1627 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1628 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1629 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1630 } 1631 v += 9; 1632 ++idx; 1633 } 1634 } 1635 z += 3; 1636 } 1637 PetscFunctionReturn(PETSC_SUCCESS); 1638 } 1639 1640 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1641 { 1642 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1643 PetscScalar *z = c; 1644 const PetscScalar *xb; 1645 PetscScalar x1, x2, x3, x4; 1646 const MatScalar *v = a->a, *vv; 1647 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1648 1649 PetscFunctionBegin; 1650 for (i = 0; i < mbs; i++) { 1651 n = ii[1] - ii[0]; 1652 ii++; 1653 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1654 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1655 jj = idx; 1656 vv = v; 1657 for (k = 0; k < cn; k++) { 1658 idx = jj; 1659 v = vv; 1660 for (j = 0; j < n; j++) { 1661 xb = b + 4 * (*idx); 1662 x1 = xb[0 + k * bm]; 1663 x2 = xb[1 + k * bm]; 1664 x3 = xb[2 + k * bm]; 1665 x4 = xb[3 + k * bm]; 1666 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1667 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1668 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1669 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1670 if (*idx != i) { 1671 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1672 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1673 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1674 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1675 } 1676 v += 16; 1677 ++idx; 1678 } 1679 } 1680 z += 4; 1681 } 1682 PetscFunctionReturn(PETSC_SUCCESS); 1683 } 1684 1685 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1686 { 1687 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1688 PetscScalar *z = c; 1689 const PetscScalar *xb; 1690 PetscScalar x1, x2, x3, x4, x5; 1691 const MatScalar *v = a->a, *vv; 1692 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1693 1694 PetscFunctionBegin; 1695 for (i = 0; i < mbs; i++) { 1696 n = ii[1] - ii[0]; 1697 ii++; 1698 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1699 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1700 jj = idx; 1701 vv = v; 1702 for (k = 0; k < cn; k++) { 1703 idx = jj; 1704 v = vv; 1705 for (j = 0; j < n; j++) { 1706 xb = b + 5 * (*idx); 1707 x1 = xb[0 + k * bm]; 1708 x2 = xb[1 + k * bm]; 1709 x3 = xb[2 + k * bm]; 1710 x4 = xb[3 + k * bm]; 1711 x5 = xb[4 + k * cm]; 1712 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1713 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1714 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1715 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1716 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1717 if (*idx != i) { 1718 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1719 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1720 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1721 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1722 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1723 } 1724 v += 25; 1725 ++idx; 1726 } 1727 } 1728 z += 5; 1729 } 1730 PetscFunctionReturn(PETSC_SUCCESS); 1731 } 1732 1733 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1734 { 1735 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1736 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1737 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1738 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1739 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1740 PetscBLASInt bbs, bcn, bbm, bcm; 1741 PetscScalar *z = NULL; 1742 PetscScalar *c, *b; 1743 const MatScalar *v; 1744 const PetscInt *idx, *ii; 1745 PetscScalar _DOne = 1.0; 1746 1747 PetscFunctionBegin; 1748 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1749 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1750 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1751 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1752 b = bd->v; 1753 PetscCall(MatZeroEntries(C)); 1754 PetscCall(MatDenseGetArray(C, &c)); 1755 switch (bs) { 1756 case 1: 1757 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1758 break; 1759 case 2: 1760 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1761 break; 1762 case 3: 1763 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1764 break; 1765 case 4: 1766 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1767 break; 1768 case 5: 1769 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1770 break; 1771 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1772 PetscCall(PetscBLASIntCast(bs, &bbs)); 1773 PetscCall(PetscBLASIntCast(cn, &bcn)); 1774 PetscCall(PetscBLASIntCast(bm, &bbm)); 1775 PetscCall(PetscBLASIntCast(cm, &bcm)); 1776 idx = a->j; 1777 v = a->a; 1778 mbs = a->mbs; 1779 ii = a->i; 1780 z = c; 1781 for (i = 0; i < mbs; i++) { 1782 n = ii[1] - ii[0]; 1783 ii++; 1784 for (j = 0; j < n; j++) { 1785 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1786 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1787 v += bs2; 1788 } 1789 z += bs; 1790 } 1791 } 1792 PetscCall(MatDenseRestoreArray(C, &c)); 1793 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1794 PetscFunctionReturn(PETSC_SUCCESS); 1795 } 1796