1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <../src/mat/impls/sbaij/seq/sbaij.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 9 { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 11 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 12 const PetscInt *idx; 13 PetscBT table_out, table_in; 14 15 PetscFunctionBegin; 16 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17 mbs = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 PetscCall(PetscBTCreate(mbs, &table_out)); 22 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 23 PetscCall(PetscBTCreate(mbs, &table_in)); 24 25 for (i = 0; i < is_max; i++) { /* for each is */ 26 isz = 0; 27 PetscCall(PetscBTMemzero(mbs, table_out)); 28 29 /* Extract the indices, assume there can be duplicate entries */ 30 PetscCall(ISGetIndices(is[i], &idx)); 31 PetscCall(ISGetLocalSize(is[i], &n)); 32 33 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 34 bcol_max = 0; 35 for (j = 0; j < n; ++j) { 36 brow = idx[j] / bs; /* convert the indices into block indices */ 37 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 38 if (!PetscBTLookupSet(table_out, brow)) { 39 nidx[isz++] = brow; 40 if (bcol_max < brow) bcol_max = brow; 41 } 42 } 43 PetscCall(ISRestoreIndices(is[i], &idx)); 44 PetscCall(ISDestroy(&is[i])); 45 46 k = 0; 47 for (j = 0; j < ov; j++) { /* for each overlap */ 48 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 49 PetscCall(PetscBTMemzero(mbs, table_in)); 50 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 51 52 n = isz; /* length of the updated is[i] */ 53 for (brow = 0; brow < mbs; brow++) { 54 start = ai[brow]; 55 end = ai[brow + 1]; 56 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 57 for (l = start; l < end; l++) { 58 bcol = aj[l]; 59 if (!PetscBTLookupSet(table_out, bcol)) { 60 nidx[isz++] = bcol; 61 if (bcol_max < bcol) bcol_max = bcol; 62 } 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l < end; l++) { 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table_in, bcol)) { 71 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 79 } /* for each is */ 80 PetscCall(PetscBTDestroy(&table_out)); 81 PetscCall(PetscFree(nidx)); 82 PetscCall(PetscBTDestroy(&table_in)); 83 PetscFunctionReturn(PETSC_SUCCESS); 84 } 85 86 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 87 Zero some ops' to avoid invalid use */ 88 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 89 { 90 PetscFunctionBegin; 91 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 92 Bseq->ops->mult = NULL; 93 Bseq->ops->multadd = NULL; 94 Bseq->ops->multtranspose = NULL; 95 Bseq->ops->multtransposeadd = NULL; 96 Bseq->ops->lufactor = NULL; 97 Bseq->ops->choleskyfactor = NULL; 98 Bseq->ops->lufactorsymbolic = NULL; 99 Bseq->ops->choleskyfactorsymbolic = NULL; 100 Bseq->ops->getinertia = NULL; 101 PetscFunctionReturn(PETSC_SUCCESS); 102 } 103 104 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 105 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 106 { 107 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 108 Mat_SeqBAIJ *d = NULL; 109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111 const PetscInt *irow, *icol; 112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113 PetscInt *aj = a->j, *ai = a->i; 114 MatScalar *mat_a; 115 Mat C; 116 PetscBool flag; 117 118 PetscFunctionBegin; 119 120 PetscCall(ISGetIndices(isrow, &irow)); 121 PetscCall(ISGetIndices(iscol, &icol)); 122 PetscCall(ISGetLocalSize(isrow, &nrows)); 123 PetscCall(ISGetLocalSize(iscol, &ncols)); 124 125 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126 ssmap = smap; 127 PetscCall(PetscMalloc1(1 + nrows, &lens)); 128 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129 /* determine lens of each row */ 130 for (i = 0; i < nrows; i++) { 131 kstart = ai[irow[i]]; 132 kend = kstart + a->ilen[irow[i]]; 133 lens[i] = 0; 134 for (k = kstart; k < kend; k++) { 135 if (ssmap[aj[k]]) lens[i]++; 136 } 137 } 138 /* Create and fill new matrix */ 139 if (scall == MAT_REUSE_MATRIX) { 140 if (sym) { 141 c = (Mat_SeqSBAIJ *)((*B)->data); 142 143 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 144 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 145 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 146 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 147 } else { 148 d = (Mat_SeqBAIJ *)((*B)->data); 149 150 PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 151 PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 152 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 153 PetscCall(PetscArrayzero(d->ilen, d->mbs)); 154 } 155 C = *B; 156 } else { 157 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 158 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 159 if (sym) { 160 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 161 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 162 } else { 163 PetscCall(MatSetType(C, MATSEQBAIJ)); 164 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 165 } 166 } 167 if (sym) c = (Mat_SeqSBAIJ *)(C->data); 168 else d = (Mat_SeqBAIJ *)(C->data); 169 for (i = 0; i < nrows; i++) { 170 row = irow[i]; 171 kstart = ai[row]; 172 kend = kstart + a->ilen[row]; 173 if (sym) { 174 mat_i = c->i[i]; 175 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 176 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 177 mat_ilen = c->ilen + i; 178 } else { 179 mat_i = d->i[i]; 180 mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 181 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 182 mat_ilen = d->ilen + i; 183 } 184 for (k = kstart; k < kend; k++) { 185 if ((tcol = ssmap[a->j[k]])) { 186 *mat_j++ = tcol - 1; 187 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 188 mat_a += bs2; 189 (*mat_ilen)++; 190 } 191 } 192 } 193 /* sort */ 194 { 195 MatScalar *work; 196 197 PetscCall(PetscMalloc1(bs2, &work)); 198 for (i = 0; i < nrows; i++) { 199 PetscInt ilen; 200 if (sym) { 201 mat_i = c->i[i]; 202 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 203 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 204 ilen = c->ilen[i]; 205 } else { 206 mat_i = d->i[i]; 207 mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 208 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 209 ilen = d->ilen[i]; 210 } 211 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 212 } 213 PetscCall(PetscFree(work)); 214 } 215 216 /* Free work space */ 217 PetscCall(ISRestoreIndices(iscol, &icol)); 218 PetscCall(PetscFree(smap)); 219 PetscCall(PetscFree(lens)); 220 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 221 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 222 223 PetscCall(ISRestoreIndices(isrow, &irow)); 224 *B = C; 225 PetscFunctionReturn(PETSC_SUCCESS); 226 } 227 228 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 229 { 230 Mat C[2]; 231 IS is1, is2, intersect = NULL; 232 PetscInt n1, n2, ni; 233 PetscBool sym = PETSC_TRUE; 234 235 PetscFunctionBegin; 236 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 237 if (isrow == iscol) { 238 is2 = is1; 239 PetscCall(PetscObjectReference((PetscObject)is2)); 240 } else { 241 PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 242 PetscCall(ISIntersect(is1, is2, &intersect)); 243 PetscCall(ISGetLocalSize(intersect, &ni)); 244 PetscCall(ISDestroy(&intersect)); 245 if (ni == 0) sym = PETSC_FALSE; 246 else if (PetscDefined(USE_DEBUG)) { 247 PetscCall(ISGetLocalSize(is1, &n1)); 248 PetscCall(ISGetLocalSize(is2, &n2)); 249 PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 250 } 251 } 252 if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 253 else { 254 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 255 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 256 PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 257 PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 258 PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 259 if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 260 else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 261 else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 262 PetscCall(MatDestroy(C)); 263 PetscCall(MatDestroy(C + 1)); 264 } 265 PetscCall(ISDestroy(&is1)); 266 PetscCall(ISDestroy(&is2)); 267 268 if (sym && isrow != iscol) { 269 PetscBool isequal; 270 PetscCall(ISEqual(isrow, iscol, &isequal)); 271 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 272 } 273 PetscFunctionReturn(PETSC_SUCCESS); 274 } 275 276 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 277 { 278 PetscInt i; 279 280 PetscFunctionBegin; 281 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 282 283 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 284 PetscFunctionReturn(PETSC_SUCCESS); 285 } 286 287 /* Should check that shapes of vectors and matrices match */ 288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 289 { 290 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 291 PetscScalar *z, x1, x2, zero = 0.0; 292 const PetscScalar *x, *xb; 293 const MatScalar *v; 294 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 295 const PetscInt *aj = a->j, *ai = a->i, *ib; 296 PetscInt nonzerorow = 0; 297 298 PetscFunctionBegin; 299 PetscCall(VecSet(zz, zero)); 300 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 301 PetscCall(VecGetArrayRead(xx, &x)); 302 PetscCall(VecGetArray(zz, &z)); 303 304 v = a->a; 305 xb = x; 306 307 for (i = 0; i < mbs; i++) { 308 n = ai[1] - ai[0]; /* length of i_th block row of A */ 309 x1 = xb[0]; 310 x2 = xb[1]; 311 ib = aj + *ai; 312 jmin = 0; 313 nonzerorow += (n > 0); 314 if (*ib == i) { /* (diag of A)*x */ 315 z[2 * i] += v[0] * x1 + v[2] * x2; 316 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 317 v += 4; 318 jmin++; 319 } 320 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 321 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 322 for (j = jmin; j < n; j++) { 323 /* (strict lower triangular part of A)*x */ 324 cval = ib[j] * 2; 325 z[cval] += v[0] * x1 + v[1] * x2; 326 z[cval + 1] += v[2] * x1 + v[3] * x2; 327 /* (strict upper triangular part of A)*x */ 328 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 329 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 330 v += 4; 331 } 332 xb += 2; 333 ai++; 334 } 335 336 PetscCall(VecRestoreArrayRead(xx, &x)); 337 PetscCall(VecRestoreArray(zz, &z)); 338 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 339 PetscFunctionReturn(PETSC_SUCCESS); 340 } 341 342 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 343 { 344 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 345 PetscScalar *z, x1, x2, x3, zero = 0.0; 346 const PetscScalar *x, *xb; 347 const MatScalar *v; 348 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 349 const PetscInt *aj = a->j, *ai = a->i, *ib; 350 PetscInt nonzerorow = 0; 351 352 PetscFunctionBegin; 353 PetscCall(VecSet(zz, zero)); 354 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 355 PetscCall(VecGetArrayRead(xx, &x)); 356 PetscCall(VecGetArray(zz, &z)); 357 358 v = a->a; 359 xb = x; 360 361 for (i = 0; i < mbs; i++) { 362 n = ai[1] - ai[0]; /* length of i_th block row of A */ 363 x1 = xb[0]; 364 x2 = xb[1]; 365 x3 = xb[2]; 366 ib = aj + *ai; 367 jmin = 0; 368 nonzerorow += (n > 0); 369 if (*ib == i) { /* (diag of A)*x */ 370 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 371 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 372 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 373 v += 9; 374 jmin++; 375 } 376 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 377 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 378 for (j = jmin; j < n; j++) { 379 /* (strict lower triangular part of A)*x */ 380 cval = ib[j] * 3; 381 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 382 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 383 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 384 /* (strict upper triangular part of A)*x */ 385 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 386 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 387 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 388 v += 9; 389 } 390 xb += 3; 391 ai++; 392 } 393 394 PetscCall(VecRestoreArrayRead(xx, &x)); 395 PetscCall(VecRestoreArray(zz, &z)); 396 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 397 PetscFunctionReturn(PETSC_SUCCESS); 398 } 399 400 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 401 { 402 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 403 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 404 const PetscScalar *x, *xb; 405 const MatScalar *v; 406 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 407 const PetscInt *aj = a->j, *ai = a->i, *ib; 408 PetscInt nonzerorow = 0; 409 410 PetscFunctionBegin; 411 PetscCall(VecSet(zz, zero)); 412 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 413 PetscCall(VecGetArrayRead(xx, &x)); 414 PetscCall(VecGetArray(zz, &z)); 415 416 v = a->a; 417 xb = x; 418 419 for (i = 0; i < mbs; i++) { 420 n = ai[1] - ai[0]; /* length of i_th block row of A */ 421 x1 = xb[0]; 422 x2 = xb[1]; 423 x3 = xb[2]; 424 x4 = xb[3]; 425 ib = aj + *ai; 426 jmin = 0; 427 nonzerorow += (n > 0); 428 if (*ib == i) { /* (diag of A)*x */ 429 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 430 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 431 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 432 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 433 v += 16; 434 jmin++; 435 } 436 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 437 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 438 for (j = jmin; j < n; j++) { 439 /* (strict lower triangular part of A)*x */ 440 cval = ib[j] * 4; 441 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 442 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 443 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 444 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 445 /* (strict upper triangular part of A)*x */ 446 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 447 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 448 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 449 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 450 v += 16; 451 } 452 xb += 4; 453 ai++; 454 } 455 456 PetscCall(VecRestoreArrayRead(xx, &x)); 457 PetscCall(VecRestoreArray(zz, &z)); 458 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 459 PetscFunctionReturn(PETSC_SUCCESS); 460 } 461 462 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 463 { 464 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 465 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 466 const PetscScalar *x, *xb; 467 const MatScalar *v; 468 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 469 const PetscInt *aj = a->j, *ai = a->i, *ib; 470 PetscInt nonzerorow = 0; 471 472 PetscFunctionBegin; 473 PetscCall(VecSet(zz, zero)); 474 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 475 PetscCall(VecGetArrayRead(xx, &x)); 476 PetscCall(VecGetArray(zz, &z)); 477 478 v = a->a; 479 xb = x; 480 481 for (i = 0; i < mbs; i++) { 482 n = ai[1] - ai[0]; /* length of i_th block row of A */ 483 x1 = xb[0]; 484 x2 = xb[1]; 485 x3 = xb[2]; 486 x4 = xb[3]; 487 x5 = xb[4]; 488 ib = aj + *ai; 489 jmin = 0; 490 nonzerorow += (n > 0); 491 if (*ib == i) { /* (diag of A)*x */ 492 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 493 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 494 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 495 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 496 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 497 v += 25; 498 jmin++; 499 } 500 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 501 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 502 for (j = jmin; j < n; j++) { 503 /* (strict lower triangular part of A)*x */ 504 cval = ib[j] * 5; 505 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 506 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 507 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 508 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 509 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 510 /* (strict upper triangular part of A)*x */ 511 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 512 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 513 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 514 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 515 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 516 v += 25; 517 } 518 xb += 5; 519 ai++; 520 } 521 522 PetscCall(VecRestoreArrayRead(xx, &x)); 523 PetscCall(VecRestoreArray(zz, &z)); 524 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 525 PetscFunctionReturn(PETSC_SUCCESS); 526 } 527 528 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 529 { 530 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 531 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 532 const PetscScalar *x, *xb; 533 const MatScalar *v; 534 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 535 const PetscInt *aj = a->j, *ai = a->i, *ib; 536 PetscInt nonzerorow = 0; 537 538 PetscFunctionBegin; 539 PetscCall(VecSet(zz, zero)); 540 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 541 PetscCall(VecGetArrayRead(xx, &x)); 542 PetscCall(VecGetArray(zz, &z)); 543 544 v = a->a; 545 xb = x; 546 547 for (i = 0; i < mbs; i++) { 548 n = ai[1] - ai[0]; /* length of i_th block row of A */ 549 x1 = xb[0]; 550 x2 = xb[1]; 551 x3 = xb[2]; 552 x4 = xb[3]; 553 x5 = xb[4]; 554 x6 = xb[5]; 555 ib = aj + *ai; 556 jmin = 0; 557 nonzerorow += (n > 0); 558 if (*ib == i) { /* (diag of A)*x */ 559 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 560 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 561 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 562 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 563 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 564 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 565 v += 36; 566 jmin++; 567 } 568 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 569 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 570 for (j = jmin; j < n; j++) { 571 /* (strict lower triangular part of A)*x */ 572 cval = ib[j] * 6; 573 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 574 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 575 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 576 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 577 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 578 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 579 /* (strict upper triangular part of A)*x */ 580 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 581 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 582 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 583 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 584 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 585 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 586 v += 36; 587 } 588 xb += 6; 589 ai++; 590 } 591 592 PetscCall(VecRestoreArrayRead(xx, &x)); 593 PetscCall(VecRestoreArray(zz, &z)); 594 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 595 PetscFunctionReturn(PETSC_SUCCESS); 596 } 597 598 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 599 { 600 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 601 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 602 const PetscScalar *x, *xb; 603 const MatScalar *v; 604 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 605 const PetscInt *aj = a->j, *ai = a->i, *ib; 606 PetscInt nonzerorow = 0; 607 608 PetscFunctionBegin; 609 PetscCall(VecSet(zz, zero)); 610 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 611 PetscCall(VecGetArrayRead(xx, &x)); 612 PetscCall(VecGetArray(zz, &z)); 613 614 v = a->a; 615 xb = x; 616 617 for (i = 0; i < mbs; i++) { 618 n = ai[1] - ai[0]; /* length of i_th block row of A */ 619 x1 = xb[0]; 620 x2 = xb[1]; 621 x3 = xb[2]; 622 x4 = xb[3]; 623 x5 = xb[4]; 624 x6 = xb[5]; 625 x7 = xb[6]; 626 ib = aj + *ai; 627 jmin = 0; 628 nonzerorow += (n > 0); 629 if (*ib == i) { /* (diag of A)*x */ 630 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 631 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 632 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 633 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 634 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 635 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 636 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 637 v += 49; 638 jmin++; 639 } 640 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 641 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 642 for (j = jmin; j < n; j++) { 643 /* (strict lower triangular part of A)*x */ 644 cval = ib[j] * 7; 645 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 646 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 647 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 648 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 649 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 650 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 651 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 652 /* (strict upper triangular part of A)*x */ 653 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 654 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 655 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 656 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 657 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 658 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 659 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 660 v += 49; 661 } 662 xb += 7; 663 ai++; 664 } 665 PetscCall(VecRestoreArrayRead(xx, &x)); 666 PetscCall(VecRestoreArray(zz, &z)); 667 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 668 PetscFunctionReturn(PETSC_SUCCESS); 669 } 670 671 /* 672 This will not work with MatScalar == float because it calls the BLAS 673 */ 674 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 675 { 676 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 677 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 678 const PetscScalar *x, *x_ptr, *xb; 679 const MatScalar *v; 680 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 681 const PetscInt *idx, *aj, *ii; 682 PetscInt nonzerorow = 0; 683 684 PetscFunctionBegin; 685 PetscCall(VecSet(zz, zero)); 686 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 687 PetscCall(VecGetArrayRead(xx, &x)); 688 PetscCall(VecGetArray(zz, &z)); 689 690 x_ptr = x; 691 z_ptr = z; 692 693 aj = a->j; 694 v = a->a; 695 ii = a->i; 696 697 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 698 work = a->mult_work; 699 700 for (i = 0; i < mbs; i++) { 701 n = ii[1] - ii[0]; 702 ncols = n * bs; 703 workt = work; 704 idx = aj + ii[0]; 705 nonzerorow += (n > 0); 706 707 /* upper triangular part */ 708 for (j = 0; j < n; j++) { 709 xb = x_ptr + bs * (*idx++); 710 for (k = 0; k < bs; k++) workt[k] = xb[k]; 711 workt += bs; 712 } 713 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 714 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 715 716 /* strict lower triangular part */ 717 idx = aj + ii[0]; 718 if (n && *idx == i) { 719 ncols -= bs; 720 v += bs2; 721 idx++; 722 n--; 723 } 724 725 if (ncols > 0) { 726 workt = work; 727 PetscCall(PetscArrayzero(workt, ncols)); 728 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 729 for (j = 0; j < n; j++) { 730 zb = z_ptr + bs * (*idx++); 731 for (k = 0; k < bs; k++) zb[k] += workt[k]; 732 workt += bs; 733 } 734 } 735 x += bs; 736 v += n * bs2; 737 z += bs; 738 ii++; 739 } 740 741 PetscCall(VecRestoreArrayRead(xx, &x)); 742 PetscCall(VecRestoreArray(zz, &z)); 743 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 744 PetscFunctionReturn(PETSC_SUCCESS); 745 } 746 747 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 748 { 749 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 750 PetscScalar *z, x1; 751 const PetscScalar *x, *xb; 752 const MatScalar *v; 753 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 754 const PetscInt *aj = a->j, *ai = a->i, *ib; 755 PetscInt nonzerorow = 0; 756 #if defined(PETSC_USE_COMPLEX) 757 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 758 #else 759 const int aconj = 0; 760 #endif 761 762 PetscFunctionBegin; 763 PetscCall(VecCopy(yy, zz)); 764 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 765 PetscCall(VecGetArrayRead(xx, &x)); 766 PetscCall(VecGetArray(zz, &z)); 767 v = a->a; 768 xb = x; 769 770 for (i = 0; i < mbs; i++) { 771 n = ai[1] - ai[0]; /* length of i_th row of A */ 772 x1 = xb[0]; 773 ib = aj + *ai; 774 jmin = 0; 775 nonzerorow += (n > 0); 776 if (n && *ib == i) { /* (diag of A)*x */ 777 z[i] += *v++ * x[*ib++]; 778 jmin++; 779 } 780 if (aconj) { 781 for (j = jmin; j < n; j++) { 782 cval = *ib; 783 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 784 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 785 } 786 } else { 787 for (j = jmin; j < n; j++) { 788 cval = *ib; 789 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 790 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 791 } 792 } 793 xb++; 794 ai++; 795 } 796 797 PetscCall(VecRestoreArrayRead(xx, &x)); 798 PetscCall(VecRestoreArray(zz, &z)); 799 800 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 801 PetscFunctionReturn(PETSC_SUCCESS); 802 } 803 804 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 805 { 806 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 807 PetscScalar *z, x1, x2; 808 const PetscScalar *x, *xb; 809 const MatScalar *v; 810 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 811 const PetscInt *aj = a->j, *ai = a->i, *ib; 812 PetscInt nonzerorow = 0; 813 814 PetscFunctionBegin; 815 PetscCall(VecCopy(yy, zz)); 816 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 817 PetscCall(VecGetArrayRead(xx, &x)); 818 PetscCall(VecGetArray(zz, &z)); 819 820 v = a->a; 821 xb = x; 822 823 for (i = 0; i < mbs; i++) { 824 n = ai[1] - ai[0]; /* length of i_th block row of A */ 825 x1 = xb[0]; 826 x2 = xb[1]; 827 ib = aj + *ai; 828 jmin = 0; 829 nonzerorow += (n > 0); 830 if (n && *ib == i) { /* (diag of A)*x */ 831 z[2 * i] += v[0] * x1 + v[2] * x2; 832 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 833 v += 4; 834 jmin++; 835 } 836 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 837 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 838 for (j = jmin; j < n; j++) { 839 /* (strict lower triangular part of A)*x */ 840 cval = ib[j] * 2; 841 z[cval] += v[0] * x1 + v[1] * x2; 842 z[cval + 1] += v[2] * x1 + v[3] * x2; 843 /* (strict upper triangular part of A)*x */ 844 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 845 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 846 v += 4; 847 } 848 xb += 2; 849 ai++; 850 } 851 PetscCall(VecRestoreArrayRead(xx, &x)); 852 PetscCall(VecRestoreArray(zz, &z)); 853 854 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 855 PetscFunctionReturn(PETSC_SUCCESS); 856 } 857 858 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 859 { 860 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 861 PetscScalar *z, x1, x2, x3; 862 const PetscScalar *x, *xb; 863 const MatScalar *v; 864 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 865 const PetscInt *aj = a->j, *ai = a->i, *ib; 866 PetscInt nonzerorow = 0; 867 868 PetscFunctionBegin; 869 PetscCall(VecCopy(yy, zz)); 870 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 871 PetscCall(VecGetArrayRead(xx, &x)); 872 PetscCall(VecGetArray(zz, &z)); 873 874 v = a->a; 875 xb = x; 876 877 for (i = 0; i < mbs; i++) { 878 n = ai[1] - ai[0]; /* length of i_th block row of A */ 879 x1 = xb[0]; 880 x2 = xb[1]; 881 x3 = xb[2]; 882 ib = aj + *ai; 883 jmin = 0; 884 nonzerorow += (n > 0); 885 if (n && *ib == i) { /* (diag of A)*x */ 886 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 887 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 888 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 889 v += 9; 890 jmin++; 891 } 892 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 893 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 894 for (j = jmin; j < n; j++) { 895 /* (strict lower triangular part of A)*x */ 896 cval = ib[j] * 3; 897 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 898 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 899 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 900 /* (strict upper triangular part of A)*x */ 901 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 902 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 903 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 904 v += 9; 905 } 906 xb += 3; 907 ai++; 908 } 909 910 PetscCall(VecRestoreArrayRead(xx, &x)); 911 PetscCall(VecRestoreArray(zz, &z)); 912 913 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 914 PetscFunctionReturn(PETSC_SUCCESS); 915 } 916 917 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 918 { 919 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 920 PetscScalar *z, x1, x2, x3, x4; 921 const PetscScalar *x, *xb; 922 const MatScalar *v; 923 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 924 const PetscInt *aj = a->j, *ai = a->i, *ib; 925 PetscInt nonzerorow = 0; 926 927 PetscFunctionBegin; 928 PetscCall(VecCopy(yy, zz)); 929 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 930 PetscCall(VecGetArrayRead(xx, &x)); 931 PetscCall(VecGetArray(zz, &z)); 932 933 v = a->a; 934 xb = x; 935 936 for (i = 0; i < mbs; i++) { 937 n = ai[1] - ai[0]; /* length of i_th block row of A */ 938 x1 = xb[0]; 939 x2 = xb[1]; 940 x3 = xb[2]; 941 x4 = xb[3]; 942 ib = aj + *ai; 943 jmin = 0; 944 nonzerorow += (n > 0); 945 if (n && *ib == i) { /* (diag of A)*x */ 946 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 947 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 948 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 949 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 950 v += 16; 951 jmin++; 952 } 953 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 954 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 955 for (j = jmin; j < n; j++) { 956 /* (strict lower triangular part of A)*x */ 957 cval = ib[j] * 4; 958 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 959 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 960 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 961 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 962 /* (strict upper triangular part of A)*x */ 963 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 964 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 965 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 966 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 967 v += 16; 968 } 969 xb += 4; 970 ai++; 971 } 972 973 PetscCall(VecRestoreArrayRead(xx, &x)); 974 PetscCall(VecRestoreArray(zz, &z)); 975 976 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 977 PetscFunctionReturn(PETSC_SUCCESS); 978 } 979 980 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 981 { 982 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 983 PetscScalar *z, x1, x2, x3, x4, x5; 984 const PetscScalar *x, *xb; 985 const MatScalar *v; 986 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 987 const PetscInt *aj = a->j, *ai = a->i, *ib; 988 PetscInt nonzerorow = 0; 989 990 PetscFunctionBegin; 991 PetscCall(VecCopy(yy, zz)); 992 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 993 PetscCall(VecGetArrayRead(xx, &x)); 994 PetscCall(VecGetArray(zz, &z)); 995 996 v = a->a; 997 xb = x; 998 999 for (i = 0; i < mbs; i++) { 1000 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1001 x1 = xb[0]; 1002 x2 = xb[1]; 1003 x3 = xb[2]; 1004 x4 = xb[3]; 1005 x5 = xb[4]; 1006 ib = aj + *ai; 1007 jmin = 0; 1008 nonzerorow += (n > 0); 1009 if (n && *ib == i) { /* (diag of A)*x */ 1010 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1011 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1012 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1013 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 1014 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1015 v += 25; 1016 jmin++; 1017 } 1018 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1019 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1020 for (j = jmin; j < n; j++) { 1021 /* (strict lower triangular part of A)*x */ 1022 cval = ib[j] * 5; 1023 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 1024 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1025 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1026 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1027 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1028 /* (strict upper triangular part of A)*x */ 1029 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1030 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1031 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1032 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1033 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1034 v += 25; 1035 } 1036 xb += 5; 1037 ai++; 1038 } 1039 1040 PetscCall(VecRestoreArrayRead(xx, &x)); 1041 PetscCall(VecRestoreArray(zz, &z)); 1042 1043 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1044 PetscFunctionReturn(PETSC_SUCCESS); 1045 } 1046 1047 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1048 { 1049 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1050 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1051 const PetscScalar *x, *xb; 1052 const MatScalar *v; 1053 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1054 const PetscInt *aj = a->j, *ai = a->i, *ib; 1055 PetscInt nonzerorow = 0; 1056 1057 PetscFunctionBegin; 1058 PetscCall(VecCopy(yy, zz)); 1059 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1060 PetscCall(VecGetArrayRead(xx, &x)); 1061 PetscCall(VecGetArray(zz, &z)); 1062 1063 v = a->a; 1064 xb = x; 1065 1066 for (i = 0; i < mbs; i++) { 1067 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1068 x1 = xb[0]; 1069 x2 = xb[1]; 1070 x3 = xb[2]; 1071 x4 = xb[3]; 1072 x5 = xb[4]; 1073 x6 = xb[5]; 1074 ib = aj + *ai; 1075 jmin = 0; 1076 nonzerorow += (n > 0); 1077 if (n && *ib == i) { /* (diag of A)*x */ 1078 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1079 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1080 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1081 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1082 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1083 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1084 v += 36; 1085 jmin++; 1086 } 1087 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1088 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1089 for (j = jmin; j < n; j++) { 1090 /* (strict lower triangular part of A)*x */ 1091 cval = ib[j] * 6; 1092 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1093 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1094 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1095 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1096 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1097 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1098 /* (strict upper triangular part of A)*x */ 1099 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1100 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1101 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1102 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1103 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1104 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1105 v += 36; 1106 } 1107 xb += 6; 1108 ai++; 1109 } 1110 1111 PetscCall(VecRestoreArrayRead(xx, &x)); 1112 PetscCall(VecRestoreArray(zz, &z)); 1113 1114 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1115 PetscFunctionReturn(PETSC_SUCCESS); 1116 } 1117 1118 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1119 { 1120 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1121 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1122 const PetscScalar *x, *xb; 1123 const MatScalar *v; 1124 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1125 const PetscInt *aj = a->j, *ai = a->i, *ib; 1126 PetscInt nonzerorow = 0; 1127 1128 PetscFunctionBegin; 1129 PetscCall(VecCopy(yy, zz)); 1130 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1131 PetscCall(VecGetArrayRead(xx, &x)); 1132 PetscCall(VecGetArray(zz, &z)); 1133 1134 v = a->a; 1135 xb = x; 1136 1137 for (i = 0; i < mbs; i++) { 1138 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1139 x1 = xb[0]; 1140 x2 = xb[1]; 1141 x3 = xb[2]; 1142 x4 = xb[3]; 1143 x5 = xb[4]; 1144 x6 = xb[5]; 1145 x7 = xb[6]; 1146 ib = aj + *ai; 1147 jmin = 0; 1148 nonzerorow += (n > 0); 1149 if (n && *ib == i) { /* (diag of A)*x */ 1150 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1151 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1152 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1153 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1154 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1155 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1156 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1157 v += 49; 1158 jmin++; 1159 } 1160 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1161 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1162 for (j = jmin; j < n; j++) { 1163 /* (strict lower triangular part of A)*x */ 1164 cval = ib[j] * 7; 1165 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1166 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1167 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1168 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1169 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1170 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1171 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1172 /* (strict upper triangular part of A)*x */ 1173 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1174 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1175 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1176 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1177 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1178 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1179 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1180 v += 49; 1181 } 1182 xb += 7; 1183 ai++; 1184 } 1185 1186 PetscCall(VecRestoreArrayRead(xx, &x)); 1187 PetscCall(VecRestoreArray(zz, &z)); 1188 1189 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1190 PetscFunctionReturn(PETSC_SUCCESS); 1191 } 1192 1193 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1194 { 1195 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1196 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1197 const PetscScalar *x, *x_ptr, *xb; 1198 const MatScalar *v; 1199 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1200 const PetscInt *idx, *aj, *ii; 1201 PetscInt nonzerorow = 0; 1202 1203 PetscFunctionBegin; 1204 PetscCall(VecCopy(yy, zz)); 1205 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1206 PetscCall(VecGetArrayRead(xx, &x)); 1207 x_ptr = x; 1208 PetscCall(VecGetArray(zz, &z)); 1209 z_ptr = z; 1210 1211 aj = a->j; 1212 v = a->a; 1213 ii = a->i; 1214 1215 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1216 work = a->mult_work; 1217 1218 for (i = 0; i < mbs; i++) { 1219 n = ii[1] - ii[0]; 1220 ncols = n * bs; 1221 workt = work; 1222 idx = aj + ii[0]; 1223 nonzerorow += (n > 0); 1224 1225 /* upper triangular part */ 1226 for (j = 0; j < n; j++) { 1227 xb = x_ptr + bs * (*idx++); 1228 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1229 workt += bs; 1230 } 1231 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1232 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1233 1234 /* strict lower triangular part */ 1235 idx = aj + ii[0]; 1236 if (n && *idx == i) { 1237 ncols -= bs; 1238 v += bs2; 1239 idx++; 1240 n--; 1241 } 1242 if (ncols > 0) { 1243 workt = work; 1244 PetscCall(PetscArrayzero(workt, ncols)); 1245 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1246 for (j = 0; j < n; j++) { 1247 zb = z_ptr + bs * (*idx++); 1248 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1249 workt += bs; 1250 } 1251 } 1252 1253 x += bs; 1254 v += n * bs2; 1255 z += bs; 1256 ii++; 1257 } 1258 1259 PetscCall(VecRestoreArrayRead(xx, &x)); 1260 PetscCall(VecRestoreArray(zz, &z)); 1261 1262 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1263 PetscFunctionReturn(PETSC_SUCCESS); 1264 } 1265 1266 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1267 { 1268 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1269 PetscScalar oalpha = alpha; 1270 PetscBLASInt one = 1, totalnz; 1271 1272 PetscFunctionBegin; 1273 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1274 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1275 PetscCall(PetscLogFlops(totalnz)); 1276 PetscFunctionReturn(PETSC_SUCCESS); 1277 } 1278 1279 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1280 { 1281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1282 const MatScalar *v = a->a; 1283 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1284 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1285 const PetscInt *aj = a->j, *col; 1286 1287 PetscFunctionBegin; 1288 if (!a->nz) { 1289 *norm = 0.0; 1290 PetscFunctionReturn(PETSC_SUCCESS); 1291 } 1292 if (type == NORM_FROBENIUS) { 1293 for (k = 0; k < mbs; k++) { 1294 jmin = a->i[k]; 1295 jmax = a->i[k + 1]; 1296 col = aj + jmin; 1297 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1298 for (i = 0; i < bs2; i++) { 1299 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1300 v++; 1301 } 1302 jmin++; 1303 } 1304 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1305 for (i = 0; i < bs2; i++) { 1306 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1307 v++; 1308 } 1309 } 1310 } 1311 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1312 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1313 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1314 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1315 for (i = 0; i < mbs; i++) jl[i] = mbs; 1316 il[0] = 0; 1317 1318 *norm = 0.0; 1319 for (k = 0; k < mbs; k++) { /* k_th block row */ 1320 for (j = 0; j < bs; j++) sum[j] = 0.0; 1321 /*-- col sum --*/ 1322 i = jl[k]; /* first |A(i,k)| to be added */ 1323 /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1324 at step k */ 1325 while (i < mbs) { 1326 nexti = jl[i]; /* next block row to be added */ 1327 ik = il[i]; /* block index of A(i,k) in the array a */ 1328 for (j = 0; j < bs; j++) { 1329 v = a->a + ik * bs2 + j * bs; 1330 for (k1 = 0; k1 < bs; k1++) { 1331 sum[j] += PetscAbsScalar(*v); 1332 v++; 1333 } 1334 } 1335 /* update il, jl */ 1336 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1337 jmax = a->i[i + 1]; 1338 if (jmin < jmax) { 1339 il[i] = jmin; 1340 j = a->j[jmin]; 1341 jl[i] = jl[j]; 1342 jl[j] = i; 1343 } 1344 i = nexti; 1345 } 1346 /*-- row sum --*/ 1347 jmin = a->i[k]; 1348 jmax = a->i[k + 1]; 1349 for (i = jmin; i < jmax; i++) { 1350 for (j = 0; j < bs; j++) { 1351 v = a->a + i * bs2 + j; 1352 for (k1 = 0; k1 < bs; k1++) { 1353 sum[j] += PetscAbsScalar(*v); 1354 v += bs; 1355 } 1356 } 1357 } 1358 /* add k_th block row to il, jl */ 1359 col = aj + jmin; 1360 if (jmax - jmin > 0 && *col == k) jmin++; 1361 if (jmin < jmax) { 1362 il[k] = jmin; 1363 j = a->j[jmin]; 1364 jl[k] = jl[j]; 1365 jl[j] = k; 1366 } 1367 for (j = 0; j < bs; j++) { 1368 if (sum[j] > *norm) *norm = sum[j]; 1369 } 1370 } 1371 PetscCall(PetscFree3(sum, il, jl)); 1372 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1373 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1374 PetscFunctionReturn(PETSC_SUCCESS); 1375 } 1376 1377 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1378 { 1379 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1380 1381 PetscFunctionBegin; 1382 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1383 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1384 *flg = PETSC_FALSE; 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 /* if the a->i are the same */ 1389 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1390 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1391 1392 /* if a->j are the same */ 1393 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1394 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1395 1396 /* if a->a are the same */ 1397 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1398 PetscFunctionReturn(PETSC_SUCCESS); 1399 } 1400 1401 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1402 { 1403 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1404 PetscInt i, j, k, row, bs, ambs, bs2; 1405 const PetscInt *ai, *aj; 1406 PetscScalar *x, zero = 0.0; 1407 const MatScalar *aa, *aa_j; 1408 1409 PetscFunctionBegin; 1410 bs = A->rmap->bs; 1411 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1412 1413 aa = a->a; 1414 ambs = a->mbs; 1415 1416 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1417 PetscInt *diag = a->diag; 1418 aa = a->a; 1419 ambs = a->mbs; 1420 PetscCall(VecGetArray(v, &x)); 1421 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1422 PetscCall(VecRestoreArray(v, &x)); 1423 PetscFunctionReturn(PETSC_SUCCESS); 1424 } 1425 1426 ai = a->i; 1427 aj = a->j; 1428 bs2 = a->bs2; 1429 PetscCall(VecSet(v, zero)); 1430 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1431 PetscCall(VecGetArray(v, &x)); 1432 for (i = 0; i < ambs; i++) { 1433 j = ai[i]; 1434 if (aj[j] == i) { /* if this is a diagonal element */ 1435 row = i * bs; 1436 aa_j = aa + j * bs2; 1437 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1438 } 1439 } 1440 PetscCall(VecRestoreArray(v, &x)); 1441 PetscFunctionReturn(PETSC_SUCCESS); 1442 } 1443 1444 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1445 { 1446 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1447 PetscScalar x; 1448 const PetscScalar *l, *li, *ri; 1449 MatScalar *aa, *v; 1450 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1451 const PetscInt *ai, *aj; 1452 PetscBool flg; 1453 1454 PetscFunctionBegin; 1455 if (ll != rr) { 1456 PetscCall(VecEqual(ll, rr, &flg)); 1457 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1458 } 1459 if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 1460 ai = a->i; 1461 aj = a->j; 1462 aa = a->a; 1463 m = A->rmap->N; 1464 bs = A->rmap->bs; 1465 mbs = a->mbs; 1466 bs2 = a->bs2; 1467 1468 PetscCall(VecGetArrayRead(ll, &l)); 1469 PetscCall(VecGetLocalSize(ll, &lm)); 1470 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1471 for (i = 0; i < mbs; i++) { /* for each block row */ 1472 M = ai[i + 1] - ai[i]; 1473 li = l + i * bs; 1474 v = aa + bs2 * ai[i]; 1475 for (j = 0; j < M; j++) { /* for each block */ 1476 ri = l + bs * aj[ai[i] + j]; 1477 for (k = 0; k < bs; k++) { 1478 x = ri[k]; 1479 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1480 } 1481 } 1482 } 1483 PetscCall(VecRestoreArrayRead(ll, &l)); 1484 PetscCall(PetscLogFlops(2.0 * a->nz)); 1485 PetscFunctionReturn(PETSC_SUCCESS); 1486 } 1487 1488 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1489 { 1490 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1491 1492 PetscFunctionBegin; 1493 info->block_size = a->bs2; 1494 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1495 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1496 info->nz_unneeded = info->nz_allocated - info->nz_used; 1497 info->assemblies = A->num_ass; 1498 info->mallocs = A->info.mallocs; 1499 info->memory = 0; /* REVIEW ME */ 1500 if (A->factortype) { 1501 info->fill_ratio_given = A->info.fill_ratio_given; 1502 info->fill_ratio_needed = A->info.fill_ratio_needed; 1503 info->factor_mallocs = A->info.factor_mallocs; 1504 } else { 1505 info->fill_ratio_given = 0; 1506 info->fill_ratio_needed = 0; 1507 info->factor_mallocs = 0; 1508 } 1509 PetscFunctionReturn(PETSC_SUCCESS); 1510 } 1511 1512 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1513 { 1514 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1515 1516 PetscFunctionBegin; 1517 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1518 PetscFunctionReturn(PETSC_SUCCESS); 1519 } 1520 1521 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1522 { 1523 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1524 PetscInt i, j, n, row, col, bs, mbs; 1525 const PetscInt *ai, *aj; 1526 PetscReal atmp; 1527 const MatScalar *aa; 1528 PetscScalar *x; 1529 PetscInt ncols, brow, bcol, krow, kcol; 1530 1531 PetscFunctionBegin; 1532 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1533 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1534 bs = A->rmap->bs; 1535 aa = a->a; 1536 ai = a->i; 1537 aj = a->j; 1538 mbs = a->mbs; 1539 1540 PetscCall(VecSet(v, 0.0)); 1541 PetscCall(VecGetArray(v, &x)); 1542 PetscCall(VecGetLocalSize(v, &n)); 1543 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1544 for (i = 0; i < mbs; i++) { 1545 ncols = ai[1] - ai[0]; 1546 ai++; 1547 brow = bs * i; 1548 for (j = 0; j < ncols; j++) { 1549 bcol = bs * (*aj); 1550 for (kcol = 0; kcol < bs; kcol++) { 1551 col = bcol + kcol; /* col index */ 1552 for (krow = 0; krow < bs; krow++) { 1553 atmp = PetscAbsScalar(*aa); 1554 aa++; 1555 row = brow + krow; /* row index */ 1556 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1557 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1558 } 1559 } 1560 aj++; 1561 } 1562 } 1563 PetscCall(VecRestoreArray(v, &x)); 1564 PetscFunctionReturn(PETSC_SUCCESS); 1565 } 1566 1567 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1568 { 1569 PetscFunctionBegin; 1570 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1571 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1572 PetscFunctionReturn(PETSC_SUCCESS); 1573 } 1574 1575 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1576 { 1577 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1578 PetscScalar *z = c; 1579 const PetscScalar *xb; 1580 PetscScalar x1; 1581 const MatScalar *v = a->a, *vv; 1582 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1583 #if defined(PETSC_USE_COMPLEX) 1584 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1585 #else 1586 const int aconj = 0; 1587 #endif 1588 1589 PetscFunctionBegin; 1590 for (i = 0; i < mbs; i++) { 1591 n = ii[1] - ii[0]; 1592 ii++; 1593 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1594 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1595 jj = idx; 1596 vv = v; 1597 for (k = 0; k < cn; k++) { 1598 idx = jj; 1599 v = vv; 1600 for (j = 0; j < n; j++) { 1601 xb = b + (*idx); 1602 x1 = xb[0 + k * bm]; 1603 z[0 + k * cm] += v[0] * x1; 1604 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1605 v += 1; 1606 ++idx; 1607 } 1608 } 1609 z += 1; 1610 } 1611 PetscFunctionReturn(PETSC_SUCCESS); 1612 } 1613 1614 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1615 { 1616 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1617 PetscScalar *z = c; 1618 const PetscScalar *xb; 1619 PetscScalar x1, x2; 1620 const MatScalar *v = a->a, *vv; 1621 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1622 1623 PetscFunctionBegin; 1624 for (i = 0; i < mbs; i++) { 1625 n = ii[1] - ii[0]; 1626 ii++; 1627 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1628 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1629 jj = idx; 1630 vv = v; 1631 for (k = 0; k < cn; k++) { 1632 idx = jj; 1633 v = vv; 1634 for (j = 0; j < n; j++) { 1635 xb = b + 2 * (*idx); 1636 x1 = xb[0 + k * bm]; 1637 x2 = xb[1 + k * bm]; 1638 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1639 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1640 if (*idx != i) { 1641 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1642 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1643 } 1644 v += 4; 1645 ++idx; 1646 } 1647 } 1648 z += 2; 1649 } 1650 PetscFunctionReturn(PETSC_SUCCESS); 1651 } 1652 1653 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1654 { 1655 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1656 PetscScalar *z = c; 1657 const PetscScalar *xb; 1658 PetscScalar x1, x2, x3; 1659 const MatScalar *v = a->a, *vv; 1660 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1661 1662 PetscFunctionBegin; 1663 for (i = 0; i < mbs; i++) { 1664 n = ii[1] - ii[0]; 1665 ii++; 1666 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1667 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1668 jj = idx; 1669 vv = v; 1670 for (k = 0; k < cn; k++) { 1671 idx = jj; 1672 v = vv; 1673 for (j = 0; j < n; j++) { 1674 xb = b + 3 * (*idx); 1675 x1 = xb[0 + k * bm]; 1676 x2 = xb[1 + k * bm]; 1677 x3 = xb[2 + k * bm]; 1678 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1679 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1680 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1681 if (*idx != i) { 1682 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1683 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1684 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1685 } 1686 v += 9; 1687 ++idx; 1688 } 1689 } 1690 z += 3; 1691 } 1692 PetscFunctionReturn(PETSC_SUCCESS); 1693 } 1694 1695 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1696 { 1697 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1698 PetscScalar *z = c; 1699 const PetscScalar *xb; 1700 PetscScalar x1, x2, x3, x4; 1701 const MatScalar *v = a->a, *vv; 1702 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1703 1704 PetscFunctionBegin; 1705 for (i = 0; i < mbs; i++) { 1706 n = ii[1] - ii[0]; 1707 ii++; 1708 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1709 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1710 jj = idx; 1711 vv = v; 1712 for (k = 0; k < cn; k++) { 1713 idx = jj; 1714 v = vv; 1715 for (j = 0; j < n; j++) { 1716 xb = b + 4 * (*idx); 1717 x1 = xb[0 + k * bm]; 1718 x2 = xb[1 + k * bm]; 1719 x3 = xb[2 + k * bm]; 1720 x4 = xb[3 + k * bm]; 1721 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1722 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1723 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1724 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1725 if (*idx != i) { 1726 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1727 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1728 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1729 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1730 } 1731 v += 16; 1732 ++idx; 1733 } 1734 } 1735 z += 4; 1736 } 1737 PetscFunctionReturn(PETSC_SUCCESS); 1738 } 1739 1740 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1741 { 1742 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1743 PetscScalar *z = c; 1744 const PetscScalar *xb; 1745 PetscScalar x1, x2, x3, x4, x5; 1746 const MatScalar *v = a->a, *vv; 1747 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1748 1749 PetscFunctionBegin; 1750 for (i = 0; i < mbs; i++) { 1751 n = ii[1] - ii[0]; 1752 ii++; 1753 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1754 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1755 jj = idx; 1756 vv = v; 1757 for (k = 0; k < cn; k++) { 1758 idx = jj; 1759 v = vv; 1760 for (j = 0; j < n; j++) { 1761 xb = b + 5 * (*idx); 1762 x1 = xb[0 + k * bm]; 1763 x2 = xb[1 + k * bm]; 1764 x3 = xb[2 + k * bm]; 1765 x4 = xb[3 + k * bm]; 1766 x5 = xb[4 + k * cm]; 1767 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1768 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1769 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1770 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1771 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1772 if (*idx != i) { 1773 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1774 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1775 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1776 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1777 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1778 } 1779 v += 25; 1780 ++idx; 1781 } 1782 } 1783 z += 5; 1784 } 1785 PetscFunctionReturn(PETSC_SUCCESS); 1786 } 1787 1788 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1789 { 1790 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1791 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1792 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1793 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1794 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1795 PetscBLASInt bbs, bcn, bbm, bcm; 1796 PetscScalar *z = NULL; 1797 PetscScalar *c, *b; 1798 const MatScalar *v; 1799 const PetscInt *idx, *ii; 1800 PetscScalar _DOne = 1.0; 1801 1802 PetscFunctionBegin; 1803 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1804 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1805 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1806 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1807 b = bd->v; 1808 PetscCall(MatZeroEntries(C)); 1809 PetscCall(MatDenseGetArray(C, &c)); 1810 switch (bs) { 1811 case 1: 1812 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1813 break; 1814 case 2: 1815 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1816 break; 1817 case 3: 1818 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1819 break; 1820 case 4: 1821 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1822 break; 1823 case 5: 1824 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1825 break; 1826 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1827 PetscCall(PetscBLASIntCast(bs, &bbs)); 1828 PetscCall(PetscBLASIntCast(cn, &bcn)); 1829 PetscCall(PetscBLASIntCast(bm, &bbm)); 1830 PetscCall(PetscBLASIntCast(cm, &bcm)); 1831 idx = a->j; 1832 v = a->a; 1833 mbs = a->mbs; 1834 ii = a->i; 1835 z = c; 1836 for (i = 0; i < mbs; i++) { 1837 n = ii[1] - ii[0]; 1838 ii++; 1839 for (j = 0; j < n; j++) { 1840 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1841 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1842 v += bs2; 1843 } 1844 z += bs; 1845 } 1846 } 1847 PetscCall(MatDenseRestoreArray(C, &c)); 1848 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1849 PetscFunctionReturn(PETSC_SUCCESS); 1850 } 1851