1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <../src/mat/impls/sbaij/seq/sbaij.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 9 { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 11 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 12 const PetscInt *idx; 13 PetscBT table_out, table_in; 14 15 PetscFunctionBegin; 16 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17 mbs = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 PetscCall(PetscBTCreate(mbs, &table_out)); 22 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 23 PetscCall(PetscBTCreate(mbs, &table_in)); 24 25 for (i = 0; i < is_max; i++) { /* for each is */ 26 isz = 0; 27 PetscCall(PetscBTMemzero(mbs, table_out)); 28 29 /* Extract the indices, assume there can be duplicate entries */ 30 PetscCall(ISGetIndices(is[i], &idx)); 31 PetscCall(ISGetLocalSize(is[i], &n)); 32 33 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 34 bcol_max = 0; 35 for (j = 0; j < n; ++j) { 36 brow = idx[j] / bs; /* convert the indices into block indices */ 37 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 38 if (!PetscBTLookupSet(table_out, brow)) { 39 nidx[isz++] = brow; 40 if (bcol_max < brow) bcol_max = brow; 41 } 42 } 43 PetscCall(ISRestoreIndices(is[i], &idx)); 44 PetscCall(ISDestroy(&is[i])); 45 46 k = 0; 47 for (j = 0; j < ov; j++) { /* for each overlap */ 48 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 49 PetscCall(PetscBTMemzero(mbs, table_in)); 50 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 51 52 n = isz; /* length of the updated is[i] */ 53 for (brow = 0; brow < mbs; brow++) { 54 start = ai[brow]; 55 end = ai[brow + 1]; 56 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 57 for (l = start; l < end; l++) { 58 bcol = aj[l]; 59 if (!PetscBTLookupSet(table_out, bcol)) { 60 nidx[isz++] = bcol; 61 if (bcol_max < bcol) bcol_max = bcol; 62 } 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l < end; l++) { 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table_in, bcol)) { 71 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 79 } /* for each is */ 80 PetscCall(PetscBTDestroy(&table_out)); 81 PetscCall(PetscFree(nidx)); 82 PetscCall(PetscBTDestroy(&table_in)); 83 PetscFunctionReturn(PETSC_SUCCESS); 84 } 85 86 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 87 Zero some ops' to avoid invalid use */ 88 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 89 { 90 PetscFunctionBegin; 91 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 92 Bseq->ops->mult = NULL; 93 Bseq->ops->multadd = NULL; 94 Bseq->ops->multtranspose = NULL; 95 Bseq->ops->multtransposeadd = NULL; 96 Bseq->ops->lufactor = NULL; 97 Bseq->ops->choleskyfactor = NULL; 98 Bseq->ops->lufactorsymbolic = NULL; 99 Bseq->ops->choleskyfactorsymbolic = NULL; 100 Bseq->ops->getinertia = NULL; 101 PetscFunctionReturn(PETSC_SUCCESS); 102 } 103 104 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 105 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 106 { 107 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 108 Mat_SeqBAIJ *d = NULL; 109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111 const PetscInt *irow, *icol; 112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113 PetscInt *aj = a->j, *ai = a->i; 114 MatScalar *mat_a; 115 Mat C; 116 PetscBool flag; 117 118 PetscFunctionBegin; 119 PetscCall(ISGetIndices(isrow, &irow)); 120 PetscCall(ISGetIndices(iscol, &icol)); 121 PetscCall(ISGetLocalSize(isrow, &nrows)); 122 PetscCall(ISGetLocalSize(iscol, &ncols)); 123 124 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 125 ssmap = smap; 126 PetscCall(PetscMalloc1(1 + nrows, &lens)); 127 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 128 /* determine lens of each row */ 129 for (i = 0; i < nrows; i++) { 130 kstart = ai[irow[i]]; 131 kend = kstart + a->ilen[irow[i]]; 132 lens[i] = 0; 133 for (k = kstart; k < kend; k++) { 134 if (ssmap[aj[k]]) lens[i]++; 135 } 136 } 137 /* Create and fill new matrix */ 138 if (scall == MAT_REUSE_MATRIX) { 139 if (sym) { 140 c = (Mat_SeqSBAIJ *)((*B)->data); 141 142 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 143 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 144 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 145 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146 } else { 147 d = (Mat_SeqBAIJ *)((*B)->data); 148 149 PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 150 PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 151 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 152 PetscCall(PetscArrayzero(d->ilen, d->mbs)); 153 } 154 C = *B; 155 } else { 156 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 157 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 158 if (sym) { 159 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 160 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 161 } else { 162 PetscCall(MatSetType(C, MATSEQBAIJ)); 163 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 164 } 165 } 166 if (sym) c = (Mat_SeqSBAIJ *)C->data; 167 else d = (Mat_SeqBAIJ *)C->data; 168 for (i = 0; i < nrows; i++) { 169 row = irow[i]; 170 kstart = ai[row]; 171 kend = kstart + a->ilen[row]; 172 if (sym) { 173 mat_i = c->i[i]; 174 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 175 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 176 mat_ilen = c->ilen + i; 177 } else { 178 mat_i = d->i[i]; 179 mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 180 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 181 mat_ilen = d->ilen + i; 182 } 183 for (k = kstart; k < kend; k++) { 184 if ((tcol = ssmap[a->j[k]])) { 185 *mat_j++ = tcol - 1; 186 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 187 mat_a += bs2; 188 (*mat_ilen)++; 189 } 190 } 191 } 192 /* sort */ 193 { 194 MatScalar *work; 195 196 PetscCall(PetscMalloc1(bs2, &work)); 197 for (i = 0; i < nrows; i++) { 198 PetscInt ilen; 199 if (sym) { 200 mat_i = c->i[i]; 201 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 202 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 203 ilen = c->ilen[i]; 204 } else { 205 mat_i = d->i[i]; 206 mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 207 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 208 ilen = d->ilen[i]; 209 } 210 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 211 } 212 PetscCall(PetscFree(work)); 213 } 214 215 /* Free work space */ 216 PetscCall(ISRestoreIndices(iscol, &icol)); 217 PetscCall(PetscFree(smap)); 218 PetscCall(PetscFree(lens)); 219 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 220 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 221 222 PetscCall(ISRestoreIndices(isrow, &irow)); 223 *B = C; 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 228 { 229 Mat C[2]; 230 IS is1, is2, intersect = NULL; 231 PetscInt n1, n2, ni; 232 PetscBool sym = PETSC_TRUE; 233 234 PetscFunctionBegin; 235 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 236 if (isrow == iscol) { 237 is2 = is1; 238 PetscCall(PetscObjectReference((PetscObject)is2)); 239 } else { 240 PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 241 PetscCall(ISIntersect(is1, is2, &intersect)); 242 PetscCall(ISGetLocalSize(intersect, &ni)); 243 PetscCall(ISDestroy(&intersect)); 244 if (ni == 0) sym = PETSC_FALSE; 245 else if (PetscDefined(USE_DEBUG)) { 246 PetscCall(ISGetLocalSize(is1, &n1)); 247 PetscCall(ISGetLocalSize(is2, &n2)); 248 PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 249 } 250 } 251 if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 252 else { 253 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 254 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 255 PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 256 PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 257 PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 258 if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 259 else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 260 else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 261 PetscCall(MatDestroy(C)); 262 PetscCall(MatDestroy(C + 1)); 263 } 264 PetscCall(ISDestroy(&is1)); 265 PetscCall(ISDestroy(&is2)); 266 267 if (sym && isrow != iscol) { 268 PetscBool isequal; 269 PetscCall(ISEqual(isrow, iscol, &isequal)); 270 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 271 } 272 PetscFunctionReturn(PETSC_SUCCESS); 273 } 274 275 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 276 { 277 PetscInt i; 278 279 PetscFunctionBegin; 280 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 281 282 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 283 PetscFunctionReturn(PETSC_SUCCESS); 284 } 285 286 /* Should check that shapes of vectors and matrices match */ 287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 290 PetscScalar *z, x1, x2, zero = 0.0; 291 const PetscScalar *x, *xb; 292 const MatScalar *v; 293 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 294 const PetscInt *aj = a->j, *ai = a->i, *ib; 295 PetscInt nonzerorow = 0; 296 297 PetscFunctionBegin; 298 PetscCall(VecSet(zz, zero)); 299 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 300 PetscCall(VecGetArrayRead(xx, &x)); 301 PetscCall(VecGetArray(zz, &z)); 302 303 v = a->a; 304 xb = x; 305 306 for (i = 0; i < mbs; i++) { 307 n = ai[1] - ai[0]; /* length of i_th block row of A */ 308 x1 = xb[0]; 309 x2 = xb[1]; 310 ib = aj + *ai; 311 jmin = 0; 312 nonzerorow += (n > 0); 313 if (*ib == i) { /* (diag of A)*x */ 314 z[2 * i] += v[0] * x1 + v[2] * x2; 315 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 316 v += 4; 317 jmin++; 318 } 319 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321 for (j = jmin; j < n; j++) { 322 /* (strict lower triangular part of A)*x */ 323 cval = ib[j] * 2; 324 z[cval] += v[0] * x1 + v[1] * x2; 325 z[cval + 1] += v[2] * x1 + v[3] * x2; 326 /* (strict upper triangular part of A)*x */ 327 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 328 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 329 v += 4; 330 } 331 xb += 2; 332 ai++; 333 } 334 335 PetscCall(VecRestoreArrayRead(xx, &x)); 336 PetscCall(VecRestoreArray(zz, &z)); 337 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 342 { 343 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 344 PetscScalar *z, x1, x2, x3, zero = 0.0; 345 const PetscScalar *x, *xb; 346 const MatScalar *v; 347 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 348 const PetscInt *aj = a->j, *ai = a->i, *ib; 349 PetscInt nonzerorow = 0; 350 351 PetscFunctionBegin; 352 PetscCall(VecSet(zz, zero)); 353 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(zz, &z)); 356 357 v = a->a; 358 xb = x; 359 360 for (i = 0; i < mbs; i++) { 361 n = ai[1] - ai[0]; /* length of i_th block row of A */ 362 x1 = xb[0]; 363 x2 = xb[1]; 364 x3 = xb[2]; 365 ib = aj + *ai; 366 jmin = 0; 367 nonzerorow += (n > 0); 368 if (*ib == i) { /* (diag of A)*x */ 369 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 370 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 371 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 372 v += 9; 373 jmin++; 374 } 375 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 376 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 377 for (j = jmin; j < n; j++) { 378 /* (strict lower triangular part of A)*x */ 379 cval = ib[j] * 3; 380 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 381 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 382 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 383 /* (strict upper triangular part of A)*x */ 384 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 385 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 386 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 387 v += 9; 388 } 389 xb += 3; 390 ai++; 391 } 392 393 PetscCall(VecRestoreArrayRead(xx, &x)); 394 PetscCall(VecRestoreArray(zz, &z)); 395 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 400 { 401 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 402 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 403 const PetscScalar *x, *xb; 404 const MatScalar *v; 405 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 406 const PetscInt *aj = a->j, *ai = a->i, *ib; 407 PetscInt nonzerorow = 0; 408 409 PetscFunctionBegin; 410 PetscCall(VecSet(zz, zero)); 411 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 412 PetscCall(VecGetArrayRead(xx, &x)); 413 PetscCall(VecGetArray(zz, &z)); 414 415 v = a->a; 416 xb = x; 417 418 for (i = 0; i < mbs; i++) { 419 n = ai[1] - ai[0]; /* length of i_th block row of A */ 420 x1 = xb[0]; 421 x2 = xb[1]; 422 x3 = xb[2]; 423 x4 = xb[3]; 424 ib = aj + *ai; 425 jmin = 0; 426 nonzerorow += (n > 0); 427 if (*ib == i) { /* (diag of A)*x */ 428 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 429 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 430 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 431 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 432 v += 16; 433 jmin++; 434 } 435 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 436 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 437 for (j = jmin; j < n; j++) { 438 /* (strict lower triangular part of A)*x */ 439 cval = ib[j] * 4; 440 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 441 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 442 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 443 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 444 /* (strict upper triangular part of A)*x */ 445 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 446 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 447 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 448 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 449 v += 16; 450 } 451 xb += 4; 452 ai++; 453 } 454 455 PetscCall(VecRestoreArrayRead(xx, &x)); 456 PetscCall(VecRestoreArray(zz, &z)); 457 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 458 PetscFunctionReturn(PETSC_SUCCESS); 459 } 460 461 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 462 { 463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 464 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 465 const PetscScalar *x, *xb; 466 const MatScalar *v; 467 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 468 const PetscInt *aj = a->j, *ai = a->i, *ib; 469 PetscInt nonzerorow = 0; 470 471 PetscFunctionBegin; 472 PetscCall(VecSet(zz, zero)); 473 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 474 PetscCall(VecGetArrayRead(xx, &x)); 475 PetscCall(VecGetArray(zz, &z)); 476 477 v = a->a; 478 xb = x; 479 480 for (i = 0; i < mbs; i++) { 481 n = ai[1] - ai[0]; /* length of i_th block row of A */ 482 x1 = xb[0]; 483 x2 = xb[1]; 484 x3 = xb[2]; 485 x4 = xb[3]; 486 x5 = xb[4]; 487 ib = aj + *ai; 488 jmin = 0; 489 nonzerorow += (n > 0); 490 if (*ib == i) { /* (diag of A)*x */ 491 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 492 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 493 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 494 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 495 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 496 v += 25; 497 jmin++; 498 } 499 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 500 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 501 for (j = jmin; j < n; j++) { 502 /* (strict lower triangular part of A)*x */ 503 cval = ib[j] * 5; 504 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 505 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 506 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 507 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 508 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 509 /* (strict upper triangular part of A)*x */ 510 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 511 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 512 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 513 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 514 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 515 v += 25; 516 } 517 xb += 5; 518 ai++; 519 } 520 521 PetscCall(VecRestoreArrayRead(xx, &x)); 522 PetscCall(VecRestoreArray(zz, &z)); 523 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 524 PetscFunctionReturn(PETSC_SUCCESS); 525 } 526 527 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 528 { 529 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 530 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 531 const PetscScalar *x, *xb; 532 const MatScalar *v; 533 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 534 const PetscInt *aj = a->j, *ai = a->i, *ib; 535 PetscInt nonzerorow = 0; 536 537 PetscFunctionBegin; 538 PetscCall(VecSet(zz, zero)); 539 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 540 PetscCall(VecGetArrayRead(xx, &x)); 541 PetscCall(VecGetArray(zz, &z)); 542 543 v = a->a; 544 xb = x; 545 546 for (i = 0; i < mbs; i++) { 547 n = ai[1] - ai[0]; /* length of i_th block row of A */ 548 x1 = xb[0]; 549 x2 = xb[1]; 550 x3 = xb[2]; 551 x4 = xb[3]; 552 x5 = xb[4]; 553 x6 = xb[5]; 554 ib = aj + *ai; 555 jmin = 0; 556 nonzerorow += (n > 0); 557 if (*ib == i) { /* (diag of A)*x */ 558 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 559 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 560 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 561 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 562 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 563 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 564 v += 36; 565 jmin++; 566 } 567 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 568 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 569 for (j = jmin; j < n; j++) { 570 /* (strict lower triangular part of A)*x */ 571 cval = ib[j] * 6; 572 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 573 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 574 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 575 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 576 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 577 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 578 /* (strict upper triangular part of A)*x */ 579 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 580 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 581 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 582 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 583 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 584 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 585 v += 36; 586 } 587 xb += 6; 588 ai++; 589 } 590 591 PetscCall(VecRestoreArrayRead(xx, &x)); 592 PetscCall(VecRestoreArray(zz, &z)); 593 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 594 PetscFunctionReturn(PETSC_SUCCESS); 595 } 596 597 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 598 { 599 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 600 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 601 const PetscScalar *x, *xb; 602 const MatScalar *v; 603 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 604 const PetscInt *aj = a->j, *ai = a->i, *ib; 605 PetscInt nonzerorow = 0; 606 607 PetscFunctionBegin; 608 PetscCall(VecSet(zz, zero)); 609 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 610 PetscCall(VecGetArrayRead(xx, &x)); 611 PetscCall(VecGetArray(zz, &z)); 612 613 v = a->a; 614 xb = x; 615 616 for (i = 0; i < mbs; i++) { 617 n = ai[1] - ai[0]; /* length of i_th block row of A */ 618 x1 = xb[0]; 619 x2 = xb[1]; 620 x3 = xb[2]; 621 x4 = xb[3]; 622 x5 = xb[4]; 623 x6 = xb[5]; 624 x7 = xb[6]; 625 ib = aj + *ai; 626 jmin = 0; 627 nonzerorow += (n > 0); 628 if (*ib == i) { /* (diag of A)*x */ 629 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 630 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 631 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 632 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 633 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 634 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 635 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 636 v += 49; 637 jmin++; 638 } 639 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 640 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 641 for (j = jmin; j < n; j++) { 642 /* (strict lower triangular part of A)*x */ 643 cval = ib[j] * 7; 644 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 645 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 646 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 647 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 648 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 649 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 650 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 651 /* (strict upper triangular part of A)*x */ 652 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 653 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 654 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 655 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 656 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 657 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 658 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 659 v += 49; 660 } 661 xb += 7; 662 ai++; 663 } 664 PetscCall(VecRestoreArrayRead(xx, &x)); 665 PetscCall(VecRestoreArray(zz, &z)); 666 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 /* 671 This will not work with MatScalar == float because it calls the BLAS 672 */ 673 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 674 { 675 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 676 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 677 const PetscScalar *x, *x_ptr, *xb; 678 const MatScalar *v; 679 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 680 const PetscInt *idx, *aj, *ii; 681 PetscInt nonzerorow = 0; 682 683 PetscFunctionBegin; 684 PetscCall(VecSet(zz, zero)); 685 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 686 PetscCall(VecGetArrayRead(xx, &x)); 687 PetscCall(VecGetArray(zz, &z)); 688 689 x_ptr = x; 690 z_ptr = z; 691 692 aj = a->j; 693 v = a->a; 694 ii = a->i; 695 696 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 697 work = a->mult_work; 698 699 for (i = 0; i < mbs; i++) { 700 n = ii[1] - ii[0]; 701 ncols = n * bs; 702 workt = work; 703 idx = aj + ii[0]; 704 nonzerorow += (n > 0); 705 706 /* upper triangular part */ 707 for (j = 0; j < n; j++) { 708 xb = x_ptr + bs * (*idx++); 709 for (k = 0; k < bs; k++) workt[k] = xb[k]; 710 workt += bs; 711 } 712 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 713 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 714 715 /* strict lower triangular part */ 716 idx = aj + ii[0]; 717 if (n && *idx == i) { 718 ncols -= bs; 719 v += bs2; 720 idx++; 721 n--; 722 } 723 724 if (ncols > 0) { 725 workt = work; 726 PetscCall(PetscArrayzero(workt, ncols)); 727 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 728 for (j = 0; j < n; j++) { 729 zb = z_ptr + bs * (*idx++); 730 for (k = 0; k < bs; k++) zb[k] += workt[k]; 731 workt += bs; 732 } 733 } 734 x += bs; 735 v += n * bs2; 736 z += bs; 737 ii++; 738 } 739 740 PetscCall(VecRestoreArrayRead(xx, &x)); 741 PetscCall(VecRestoreArray(zz, &z)); 742 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 743 PetscFunctionReturn(PETSC_SUCCESS); 744 } 745 746 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 747 { 748 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 749 PetscScalar *z, x1; 750 const PetscScalar *x, *xb; 751 const MatScalar *v; 752 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 753 const PetscInt *aj = a->j, *ai = a->i, *ib; 754 PetscInt nonzerorow = 0; 755 #if defined(PETSC_USE_COMPLEX) 756 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 757 #else 758 const int aconj = 0; 759 #endif 760 761 PetscFunctionBegin; 762 PetscCall(VecCopy(yy, zz)); 763 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 764 PetscCall(VecGetArrayRead(xx, &x)); 765 PetscCall(VecGetArray(zz, &z)); 766 v = a->a; 767 xb = x; 768 769 for (i = 0; i < mbs; i++) { 770 n = ai[1] - ai[0]; /* length of i_th row of A */ 771 x1 = xb[0]; 772 ib = aj + *ai; 773 jmin = 0; 774 nonzerorow += (n > 0); 775 if (n && *ib == i) { /* (diag of A)*x */ 776 z[i] += *v++ * x[*ib++]; 777 jmin++; 778 } 779 if (aconj) { 780 for (j = jmin; j < n; j++) { 781 cval = *ib; 782 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 783 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 784 } 785 } else { 786 for (j = jmin; j < n; j++) { 787 cval = *ib; 788 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 789 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 790 } 791 } 792 xb++; 793 ai++; 794 } 795 796 PetscCall(VecRestoreArrayRead(xx, &x)); 797 PetscCall(VecRestoreArray(zz, &z)); 798 799 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 800 PetscFunctionReturn(PETSC_SUCCESS); 801 } 802 803 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 804 { 805 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 806 PetscScalar *z, x1, x2; 807 const PetscScalar *x, *xb; 808 const MatScalar *v; 809 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 810 const PetscInt *aj = a->j, *ai = a->i, *ib; 811 PetscInt nonzerorow = 0; 812 813 PetscFunctionBegin; 814 PetscCall(VecCopy(yy, zz)); 815 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 816 PetscCall(VecGetArrayRead(xx, &x)); 817 PetscCall(VecGetArray(zz, &z)); 818 819 v = a->a; 820 xb = x; 821 822 for (i = 0; i < mbs; i++) { 823 n = ai[1] - ai[0]; /* length of i_th block row of A */ 824 x1 = xb[0]; 825 x2 = xb[1]; 826 ib = aj + *ai; 827 jmin = 0; 828 nonzerorow += (n > 0); 829 if (n && *ib == i) { /* (diag of A)*x */ 830 z[2 * i] += v[0] * x1 + v[2] * x2; 831 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 832 v += 4; 833 jmin++; 834 } 835 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 836 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 837 for (j = jmin; j < n; j++) { 838 /* (strict lower triangular part of A)*x */ 839 cval = ib[j] * 2; 840 z[cval] += v[0] * x1 + v[1] * x2; 841 z[cval + 1] += v[2] * x1 + v[3] * x2; 842 /* (strict upper triangular part of A)*x */ 843 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 844 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 845 v += 4; 846 } 847 xb += 2; 848 ai++; 849 } 850 PetscCall(VecRestoreArrayRead(xx, &x)); 851 PetscCall(VecRestoreArray(zz, &z)); 852 853 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 854 PetscFunctionReturn(PETSC_SUCCESS); 855 } 856 857 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 858 { 859 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 860 PetscScalar *z, x1, x2, x3; 861 const PetscScalar *x, *xb; 862 const MatScalar *v; 863 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 864 const PetscInt *aj = a->j, *ai = a->i, *ib; 865 PetscInt nonzerorow = 0; 866 867 PetscFunctionBegin; 868 PetscCall(VecCopy(yy, zz)); 869 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 870 PetscCall(VecGetArrayRead(xx, &x)); 871 PetscCall(VecGetArray(zz, &z)); 872 873 v = a->a; 874 xb = x; 875 876 for (i = 0; i < mbs; i++) { 877 n = ai[1] - ai[0]; /* length of i_th block row of A */ 878 x1 = xb[0]; 879 x2 = xb[1]; 880 x3 = xb[2]; 881 ib = aj + *ai; 882 jmin = 0; 883 nonzerorow += (n > 0); 884 if (n && *ib == i) { /* (diag of A)*x */ 885 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 886 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 887 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 888 v += 9; 889 jmin++; 890 } 891 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 892 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 893 for (j = jmin; j < n; j++) { 894 /* (strict lower triangular part of A)*x */ 895 cval = ib[j] * 3; 896 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 897 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 898 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 899 /* (strict upper triangular part of A)*x */ 900 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 901 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 902 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 903 v += 9; 904 } 905 xb += 3; 906 ai++; 907 } 908 909 PetscCall(VecRestoreArrayRead(xx, &x)); 910 PetscCall(VecRestoreArray(zz, &z)); 911 912 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 913 PetscFunctionReturn(PETSC_SUCCESS); 914 } 915 916 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 917 { 918 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 919 PetscScalar *z, x1, x2, x3, x4; 920 const PetscScalar *x, *xb; 921 const MatScalar *v; 922 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 923 const PetscInt *aj = a->j, *ai = a->i, *ib; 924 PetscInt nonzerorow = 0; 925 926 PetscFunctionBegin; 927 PetscCall(VecCopy(yy, zz)); 928 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 929 PetscCall(VecGetArrayRead(xx, &x)); 930 PetscCall(VecGetArray(zz, &z)); 931 932 v = a->a; 933 xb = x; 934 935 for (i = 0; i < mbs; i++) { 936 n = ai[1] - ai[0]; /* length of i_th block row of A */ 937 x1 = xb[0]; 938 x2 = xb[1]; 939 x3 = xb[2]; 940 x4 = xb[3]; 941 ib = aj + *ai; 942 jmin = 0; 943 nonzerorow += (n > 0); 944 if (n && *ib == i) { /* (diag of A)*x */ 945 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 946 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 947 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 948 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 949 v += 16; 950 jmin++; 951 } 952 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 953 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 954 for (j = jmin; j < n; j++) { 955 /* (strict lower triangular part of A)*x */ 956 cval = ib[j] * 4; 957 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 958 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 959 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 960 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 961 /* (strict upper triangular part of A)*x */ 962 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 963 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 964 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 965 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 966 v += 16; 967 } 968 xb += 4; 969 ai++; 970 } 971 972 PetscCall(VecRestoreArrayRead(xx, &x)); 973 PetscCall(VecRestoreArray(zz, &z)); 974 975 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 976 PetscFunctionReturn(PETSC_SUCCESS); 977 } 978 979 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 980 { 981 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 982 PetscScalar *z, x1, x2, x3, x4, x5; 983 const PetscScalar *x, *xb; 984 const MatScalar *v; 985 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 986 const PetscInt *aj = a->j, *ai = a->i, *ib; 987 PetscInt nonzerorow = 0; 988 989 PetscFunctionBegin; 990 PetscCall(VecCopy(yy, zz)); 991 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 992 PetscCall(VecGetArrayRead(xx, &x)); 993 PetscCall(VecGetArray(zz, &z)); 994 995 v = a->a; 996 xb = x; 997 998 for (i = 0; i < mbs; i++) { 999 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1000 x1 = xb[0]; 1001 x2 = xb[1]; 1002 x3 = xb[2]; 1003 x4 = xb[3]; 1004 x5 = xb[4]; 1005 ib = aj + *ai; 1006 jmin = 0; 1007 nonzerorow += (n > 0); 1008 if (n && *ib == i) { /* (diag of A)*x */ 1009 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1010 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1011 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1012 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 1013 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1014 v += 25; 1015 jmin++; 1016 } 1017 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1018 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1019 for (j = jmin; j < n; j++) { 1020 /* (strict lower triangular part of A)*x */ 1021 cval = ib[j] * 5; 1022 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 1023 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1024 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1025 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1026 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1027 /* (strict upper triangular part of A)*x */ 1028 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1029 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1030 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1031 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1032 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1033 v += 25; 1034 } 1035 xb += 5; 1036 ai++; 1037 } 1038 1039 PetscCall(VecRestoreArrayRead(xx, &x)); 1040 PetscCall(VecRestoreArray(zz, &z)); 1041 1042 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1043 PetscFunctionReturn(PETSC_SUCCESS); 1044 } 1045 1046 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1047 { 1048 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1049 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1050 const PetscScalar *x, *xb; 1051 const MatScalar *v; 1052 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1053 const PetscInt *aj = a->j, *ai = a->i, *ib; 1054 PetscInt nonzerorow = 0; 1055 1056 PetscFunctionBegin; 1057 PetscCall(VecCopy(yy, zz)); 1058 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1059 PetscCall(VecGetArrayRead(xx, &x)); 1060 PetscCall(VecGetArray(zz, &z)); 1061 1062 v = a->a; 1063 xb = x; 1064 1065 for (i = 0; i < mbs; i++) { 1066 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1067 x1 = xb[0]; 1068 x2 = xb[1]; 1069 x3 = xb[2]; 1070 x4 = xb[3]; 1071 x5 = xb[4]; 1072 x6 = xb[5]; 1073 ib = aj + *ai; 1074 jmin = 0; 1075 nonzerorow += (n > 0); 1076 if (n && *ib == i) { /* (diag of A)*x */ 1077 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1078 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1079 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1080 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1081 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1082 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1083 v += 36; 1084 jmin++; 1085 } 1086 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1087 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1088 for (j = jmin; j < n; j++) { 1089 /* (strict lower triangular part of A)*x */ 1090 cval = ib[j] * 6; 1091 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1092 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1093 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1094 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1095 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1096 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1097 /* (strict upper triangular part of A)*x */ 1098 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1099 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1100 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1101 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1102 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1103 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1104 v += 36; 1105 } 1106 xb += 6; 1107 ai++; 1108 } 1109 1110 PetscCall(VecRestoreArrayRead(xx, &x)); 1111 PetscCall(VecRestoreArray(zz, &z)); 1112 1113 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1114 PetscFunctionReturn(PETSC_SUCCESS); 1115 } 1116 1117 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1118 { 1119 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1120 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1121 const PetscScalar *x, *xb; 1122 const MatScalar *v; 1123 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1124 const PetscInt *aj = a->j, *ai = a->i, *ib; 1125 PetscInt nonzerorow = 0; 1126 1127 PetscFunctionBegin; 1128 PetscCall(VecCopy(yy, zz)); 1129 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1130 PetscCall(VecGetArrayRead(xx, &x)); 1131 PetscCall(VecGetArray(zz, &z)); 1132 1133 v = a->a; 1134 xb = x; 1135 1136 for (i = 0; i < mbs; i++) { 1137 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1138 x1 = xb[0]; 1139 x2 = xb[1]; 1140 x3 = xb[2]; 1141 x4 = xb[3]; 1142 x5 = xb[4]; 1143 x6 = xb[5]; 1144 x7 = xb[6]; 1145 ib = aj + *ai; 1146 jmin = 0; 1147 nonzerorow += (n > 0); 1148 if (n && *ib == i) { /* (diag of A)*x */ 1149 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1150 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1151 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1152 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1153 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1154 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1155 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1156 v += 49; 1157 jmin++; 1158 } 1159 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1160 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1161 for (j = jmin; j < n; j++) { 1162 /* (strict lower triangular part of A)*x */ 1163 cval = ib[j] * 7; 1164 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1165 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1166 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1167 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1168 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1169 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1170 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1171 /* (strict upper triangular part of A)*x */ 1172 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1173 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1174 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1175 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1176 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1177 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1178 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1179 v += 49; 1180 } 1181 xb += 7; 1182 ai++; 1183 } 1184 1185 PetscCall(VecRestoreArrayRead(xx, &x)); 1186 PetscCall(VecRestoreArray(zz, &z)); 1187 1188 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1189 PetscFunctionReturn(PETSC_SUCCESS); 1190 } 1191 1192 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1193 { 1194 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1195 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1196 const PetscScalar *x, *x_ptr, *xb; 1197 const MatScalar *v; 1198 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1199 const PetscInt *idx, *aj, *ii; 1200 PetscInt nonzerorow = 0; 1201 1202 PetscFunctionBegin; 1203 PetscCall(VecCopy(yy, zz)); 1204 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1205 PetscCall(VecGetArrayRead(xx, &x)); 1206 x_ptr = x; 1207 PetscCall(VecGetArray(zz, &z)); 1208 z_ptr = z; 1209 1210 aj = a->j; 1211 v = a->a; 1212 ii = a->i; 1213 1214 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1215 work = a->mult_work; 1216 1217 for (i = 0; i < mbs; i++) { 1218 n = ii[1] - ii[0]; 1219 ncols = n * bs; 1220 workt = work; 1221 idx = aj + ii[0]; 1222 nonzerorow += (n > 0); 1223 1224 /* upper triangular part */ 1225 for (j = 0; j < n; j++) { 1226 xb = x_ptr + bs * (*idx++); 1227 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1228 workt += bs; 1229 } 1230 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1231 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1232 1233 /* strict lower triangular part */ 1234 idx = aj + ii[0]; 1235 if (n && *idx == i) { 1236 ncols -= bs; 1237 v += bs2; 1238 idx++; 1239 n--; 1240 } 1241 if (ncols > 0) { 1242 workt = work; 1243 PetscCall(PetscArrayzero(workt, ncols)); 1244 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1245 for (j = 0; j < n; j++) { 1246 zb = z_ptr + bs * (*idx++); 1247 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1248 workt += bs; 1249 } 1250 } 1251 1252 x += bs; 1253 v += n * bs2; 1254 z += bs; 1255 ii++; 1256 } 1257 1258 PetscCall(VecRestoreArrayRead(xx, &x)); 1259 PetscCall(VecRestoreArray(zz, &z)); 1260 1261 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1262 PetscFunctionReturn(PETSC_SUCCESS); 1263 } 1264 1265 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1266 { 1267 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1268 PetscScalar oalpha = alpha; 1269 PetscBLASInt one = 1, totalnz; 1270 1271 PetscFunctionBegin; 1272 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1273 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1274 PetscCall(PetscLogFlops(totalnz)); 1275 PetscFunctionReturn(PETSC_SUCCESS); 1276 } 1277 1278 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1279 { 1280 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1281 const MatScalar *v = a->a; 1282 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1283 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1284 const PetscInt *aj = a->j, *col; 1285 1286 PetscFunctionBegin; 1287 if (!a->nz) { 1288 *norm = 0.0; 1289 PetscFunctionReturn(PETSC_SUCCESS); 1290 } 1291 if (type == NORM_FROBENIUS) { 1292 for (k = 0; k < mbs; k++) { 1293 jmin = a->i[k]; 1294 jmax = a->i[k + 1]; 1295 col = aj + jmin; 1296 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1297 for (i = 0; i < bs2; i++) { 1298 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1299 v++; 1300 } 1301 jmin++; 1302 } 1303 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1304 for (i = 0; i < bs2; i++) { 1305 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1306 v++; 1307 } 1308 } 1309 } 1310 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1311 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1312 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1313 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1314 for (i = 0; i < mbs; i++) jl[i] = mbs; 1315 il[0] = 0; 1316 1317 *norm = 0.0; 1318 for (k = 0; k < mbs; k++) { /* k_th block row */ 1319 for (j = 0; j < bs; j++) sum[j] = 0.0; 1320 /*-- col sum --*/ 1321 i = jl[k]; /* first |A(i,k)| to be added */ 1322 /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1323 at step k */ 1324 while (i < mbs) { 1325 nexti = jl[i]; /* next block row to be added */ 1326 ik = il[i]; /* block index of A(i,k) in the array a */ 1327 for (j = 0; j < bs; j++) { 1328 v = a->a + ik * bs2 + j * bs; 1329 for (k1 = 0; k1 < bs; k1++) { 1330 sum[j] += PetscAbsScalar(*v); 1331 v++; 1332 } 1333 } 1334 /* update il, jl */ 1335 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1336 jmax = a->i[i + 1]; 1337 if (jmin < jmax) { 1338 il[i] = jmin; 1339 j = a->j[jmin]; 1340 jl[i] = jl[j]; 1341 jl[j] = i; 1342 } 1343 i = nexti; 1344 } 1345 /*-- row sum --*/ 1346 jmin = a->i[k]; 1347 jmax = a->i[k + 1]; 1348 for (i = jmin; i < jmax; i++) { 1349 for (j = 0; j < bs; j++) { 1350 v = a->a + i * bs2 + j; 1351 for (k1 = 0; k1 < bs; k1++) { 1352 sum[j] += PetscAbsScalar(*v); 1353 v += bs; 1354 } 1355 } 1356 } 1357 /* add k_th block row to il, jl */ 1358 col = aj + jmin; 1359 if (jmax - jmin > 0 && *col == k) jmin++; 1360 if (jmin < jmax) { 1361 il[k] = jmin; 1362 j = a->j[jmin]; 1363 jl[k] = jl[j]; 1364 jl[j] = k; 1365 } 1366 for (j = 0; j < bs; j++) { 1367 if (sum[j] > *norm) *norm = sum[j]; 1368 } 1369 } 1370 PetscCall(PetscFree3(sum, il, jl)); 1371 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1372 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1373 PetscFunctionReturn(PETSC_SUCCESS); 1374 } 1375 1376 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1377 { 1378 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1379 1380 PetscFunctionBegin; 1381 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1382 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1383 *flg = PETSC_FALSE; 1384 PetscFunctionReturn(PETSC_SUCCESS); 1385 } 1386 1387 /* if the a->i are the same */ 1388 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1389 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1390 1391 /* if a->j are the same */ 1392 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1393 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1394 1395 /* if a->a are the same */ 1396 PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg)); 1397 PetscFunctionReturn(PETSC_SUCCESS); 1398 } 1399 1400 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1401 { 1402 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1403 PetscInt n; 1404 const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2; 1405 PetscScalar *x; 1406 const MatScalar *aa = a->a, *aa_j; 1407 const PetscInt *ai = a->i, *adiag; 1408 PetscBool diagDense; 1409 1410 PetscFunctionBegin; 1411 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1412 PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense)); 1413 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1414 PetscCall(VecGetArrayWrite(v, &x)); 1415 for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]]; 1416 PetscCall(VecRestoreArrayWrite(v, &x)); 1417 PetscFunctionReturn(PETSC_SUCCESS); 1418 } 1419 1420 PetscCall(VecGetLocalSize(v, &n)); 1421 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1422 PetscCall(VecGetArrayWrite(v, &x)); 1423 1424 if (diagDense) { 1425 for (PetscInt i = 0, row = 0; i < ambs; i++) { 1426 aa_j = aa + adiag[i] * bs2; 1427 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k]; 1428 } 1429 } else { 1430 for (PetscInt i = 0, row = 0; i < ambs; i++) { 1431 const PetscInt j = adiag[i]; 1432 1433 if (j != ai[i + 1]) { 1434 aa_j = aa + j * bs2; 1435 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k]; 1436 } else { 1437 for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0; 1438 } 1439 } 1440 } 1441 PetscCall(VecRestoreArrayWrite(v, &x)); 1442 PetscFunctionReturn(PETSC_SUCCESS); 1443 } 1444 1445 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1446 { 1447 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1448 PetscScalar x; 1449 const PetscScalar *l, *li, *ri; 1450 MatScalar *aa, *v; 1451 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1452 const PetscInt *ai, *aj; 1453 PetscBool flg; 1454 1455 PetscFunctionBegin; 1456 if (ll != rr) { 1457 PetscCall(VecEqual(ll, rr, &flg)); 1458 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1459 } 1460 if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 1461 ai = a->i; 1462 aj = a->j; 1463 aa = a->a; 1464 m = A->rmap->N; 1465 bs = A->rmap->bs; 1466 mbs = a->mbs; 1467 bs2 = a->bs2; 1468 1469 PetscCall(VecGetArrayRead(ll, &l)); 1470 PetscCall(VecGetLocalSize(ll, &lm)); 1471 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1472 for (i = 0; i < mbs; i++) { /* for each block row */ 1473 M = ai[i + 1] - ai[i]; 1474 li = l + i * bs; 1475 v = aa + bs2 * ai[i]; 1476 for (j = 0; j < M; j++) { /* for each block */ 1477 ri = l + bs * aj[ai[i] + j]; 1478 for (k = 0; k < bs; k++) { 1479 x = ri[k]; 1480 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1481 } 1482 } 1483 } 1484 PetscCall(VecRestoreArrayRead(ll, &l)); 1485 PetscCall(PetscLogFlops(2.0 * a->nz)); 1486 PetscFunctionReturn(PETSC_SUCCESS); 1487 } 1488 1489 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1490 { 1491 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1492 1493 PetscFunctionBegin; 1494 info->block_size = a->bs2; 1495 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1496 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1497 info->nz_unneeded = info->nz_allocated - info->nz_used; 1498 info->assemblies = A->num_ass; 1499 info->mallocs = A->info.mallocs; 1500 info->memory = 0; /* REVIEW ME */ 1501 if (A->factortype) { 1502 info->fill_ratio_given = A->info.fill_ratio_given; 1503 info->fill_ratio_needed = A->info.fill_ratio_needed; 1504 info->factor_mallocs = A->info.factor_mallocs; 1505 } else { 1506 info->fill_ratio_given = 0; 1507 info->fill_ratio_needed = 0; 1508 info->factor_mallocs = 0; 1509 } 1510 PetscFunctionReturn(PETSC_SUCCESS); 1511 } 1512 1513 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1514 { 1515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1516 1517 PetscFunctionBegin; 1518 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1519 PetscFunctionReturn(PETSC_SUCCESS); 1520 } 1521 1522 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1523 { 1524 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1525 PetscInt i, j, n, row, col, bs, mbs; 1526 const PetscInt *ai, *aj; 1527 PetscReal atmp; 1528 const MatScalar *aa; 1529 PetscScalar *x; 1530 PetscInt ncols, brow, bcol, krow, kcol; 1531 1532 PetscFunctionBegin; 1533 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1534 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1535 bs = A->rmap->bs; 1536 aa = a->a; 1537 ai = a->i; 1538 aj = a->j; 1539 mbs = a->mbs; 1540 1541 PetscCall(VecSet(v, 0.0)); 1542 PetscCall(VecGetArray(v, &x)); 1543 PetscCall(VecGetLocalSize(v, &n)); 1544 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1545 for (i = 0; i < mbs; i++) { 1546 ncols = ai[1] - ai[0]; 1547 ai++; 1548 brow = bs * i; 1549 for (j = 0; j < ncols; j++) { 1550 bcol = bs * (*aj); 1551 for (kcol = 0; kcol < bs; kcol++) { 1552 col = bcol + kcol; /* col index */ 1553 for (krow = 0; krow < bs; krow++) { 1554 atmp = PetscAbsScalar(*aa); 1555 aa++; 1556 row = brow + krow; /* row index */ 1557 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1558 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1559 } 1560 } 1561 aj++; 1562 } 1563 } 1564 PetscCall(VecRestoreArray(v, &x)); 1565 PetscFunctionReturn(PETSC_SUCCESS); 1566 } 1567 1568 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1569 { 1570 PetscFunctionBegin; 1571 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1572 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1573 PetscFunctionReturn(PETSC_SUCCESS); 1574 } 1575 1576 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1577 { 1578 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1579 PetscScalar *z = c; 1580 const PetscScalar *xb; 1581 PetscScalar x1; 1582 const MatScalar *v = a->a, *vv; 1583 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1584 #if defined(PETSC_USE_COMPLEX) 1585 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1586 #else 1587 const int aconj = 0; 1588 #endif 1589 1590 PetscFunctionBegin; 1591 for (i = 0; i < mbs; i++) { 1592 n = ii[1] - ii[0]; 1593 ii++; 1594 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1595 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1596 jj = idx; 1597 vv = v; 1598 for (k = 0; k < cn; k++) { 1599 idx = jj; 1600 v = vv; 1601 for (j = 0; j < n; j++) { 1602 xb = b + (*idx); 1603 x1 = xb[0 + k * bm]; 1604 z[0 + k * cm] += v[0] * x1; 1605 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1606 v += 1; 1607 ++idx; 1608 } 1609 } 1610 z += 1; 1611 } 1612 PetscFunctionReturn(PETSC_SUCCESS); 1613 } 1614 1615 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1616 { 1617 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1618 PetscScalar *z = c; 1619 const PetscScalar *xb; 1620 PetscScalar x1, x2; 1621 const MatScalar *v = a->a, *vv; 1622 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1623 1624 PetscFunctionBegin; 1625 for (i = 0; i < mbs; i++) { 1626 n = ii[1] - ii[0]; 1627 ii++; 1628 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1629 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1630 jj = idx; 1631 vv = v; 1632 for (k = 0; k < cn; k++) { 1633 idx = jj; 1634 v = vv; 1635 for (j = 0; j < n; j++) { 1636 xb = b + 2 * (*idx); 1637 x1 = xb[0 + k * bm]; 1638 x2 = xb[1 + k * bm]; 1639 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1640 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1641 if (*idx != i) { 1642 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1643 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1644 } 1645 v += 4; 1646 ++idx; 1647 } 1648 } 1649 z += 2; 1650 } 1651 PetscFunctionReturn(PETSC_SUCCESS); 1652 } 1653 1654 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1655 { 1656 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1657 PetscScalar *z = c; 1658 const PetscScalar *xb; 1659 PetscScalar x1, x2, x3; 1660 const MatScalar *v = a->a, *vv; 1661 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1662 1663 PetscFunctionBegin; 1664 for (i = 0; i < mbs; i++) { 1665 n = ii[1] - ii[0]; 1666 ii++; 1667 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1668 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1669 jj = idx; 1670 vv = v; 1671 for (k = 0; k < cn; k++) { 1672 idx = jj; 1673 v = vv; 1674 for (j = 0; j < n; j++) { 1675 xb = b + 3 * (*idx); 1676 x1 = xb[0 + k * bm]; 1677 x2 = xb[1 + k * bm]; 1678 x3 = xb[2 + k * bm]; 1679 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1680 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1681 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1682 if (*idx != i) { 1683 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1684 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1685 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1686 } 1687 v += 9; 1688 ++idx; 1689 } 1690 } 1691 z += 3; 1692 } 1693 PetscFunctionReturn(PETSC_SUCCESS); 1694 } 1695 1696 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1697 { 1698 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1699 PetscScalar *z = c; 1700 const PetscScalar *xb; 1701 PetscScalar x1, x2, x3, x4; 1702 const MatScalar *v = a->a, *vv; 1703 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1704 1705 PetscFunctionBegin; 1706 for (i = 0; i < mbs; i++) { 1707 n = ii[1] - ii[0]; 1708 ii++; 1709 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1710 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1711 jj = idx; 1712 vv = v; 1713 for (k = 0; k < cn; k++) { 1714 idx = jj; 1715 v = vv; 1716 for (j = 0; j < n; j++) { 1717 xb = b + 4 * (*idx); 1718 x1 = xb[0 + k * bm]; 1719 x2 = xb[1 + k * bm]; 1720 x3 = xb[2 + k * bm]; 1721 x4 = xb[3 + k * bm]; 1722 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1723 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1724 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1725 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1726 if (*idx != i) { 1727 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1728 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1729 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1730 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1731 } 1732 v += 16; 1733 ++idx; 1734 } 1735 } 1736 z += 4; 1737 } 1738 PetscFunctionReturn(PETSC_SUCCESS); 1739 } 1740 1741 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1742 { 1743 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1744 PetscScalar *z = c; 1745 const PetscScalar *xb; 1746 PetscScalar x1, x2, x3, x4, x5; 1747 const MatScalar *v = a->a, *vv; 1748 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1749 1750 PetscFunctionBegin; 1751 for (i = 0; i < mbs; i++) { 1752 n = ii[1] - ii[0]; 1753 ii++; 1754 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1755 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1756 jj = idx; 1757 vv = v; 1758 for (k = 0; k < cn; k++) { 1759 idx = jj; 1760 v = vv; 1761 for (j = 0; j < n; j++) { 1762 xb = b + 5 * (*idx); 1763 x1 = xb[0 + k * bm]; 1764 x2 = xb[1 + k * bm]; 1765 x3 = xb[2 + k * bm]; 1766 x4 = xb[3 + k * bm]; 1767 x5 = xb[4 + k * cm]; 1768 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1769 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1770 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1771 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1772 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1773 if (*idx != i) { 1774 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1775 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1776 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1777 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1778 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1779 } 1780 v += 25; 1781 ++idx; 1782 } 1783 } 1784 z += 5; 1785 } 1786 PetscFunctionReturn(PETSC_SUCCESS); 1787 } 1788 1789 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1790 { 1791 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1792 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1793 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1794 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1795 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1796 PetscBLASInt bbs, bcn, bbm, bcm; 1797 PetscScalar *z = NULL; 1798 PetscScalar *c, *b; 1799 const MatScalar *v; 1800 const PetscInt *idx, *ii; 1801 PetscScalar _DOne = 1.0; 1802 1803 PetscFunctionBegin; 1804 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1805 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1806 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1807 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1808 b = bd->v; 1809 PetscCall(MatZeroEntries(C)); 1810 PetscCall(MatDenseGetArray(C, &c)); 1811 switch (bs) { 1812 case 1: 1813 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1814 break; 1815 case 2: 1816 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1817 break; 1818 case 3: 1819 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1820 break; 1821 case 4: 1822 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1823 break; 1824 case 5: 1825 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1826 break; 1827 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1828 PetscCall(PetscBLASIntCast(bs, &bbs)); 1829 PetscCall(PetscBLASIntCast(cn, &bcn)); 1830 PetscCall(PetscBLASIntCast(bm, &bbm)); 1831 PetscCall(PetscBLASIntCast(cm, &bcm)); 1832 idx = a->j; 1833 v = a->a; 1834 mbs = a->mbs; 1835 ii = a->i; 1836 z = c; 1837 for (i = 0; i < mbs; i++) { 1838 n = ii[1] - ii[0]; 1839 ii++; 1840 for (j = 0; j < n; j++) { 1841 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1842 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1843 v += bs2; 1844 } 1845 z += bs; 1846 } 1847 } 1848 PetscCall(MatDenseRestoreArray(C, &c)); 1849 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1850 PetscFunctionReturn(PETSC_SUCCESS); 1851 } 1852