1 2 /* 3 Defines the basic matrix operations for the BAIJ (compressed row) 4 matrix storage format. 5 */ 6 #include <../src/mat/impls/baij/seq/baij.h> /*I "petscmat.h" I*/ 7 #include <petscblaslapack.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 #include <petsc/private/kernels/blockmatmult.h> 10 11 #if defined(PETSC_HAVE_HYPRE) 12 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *); 13 #endif 14 15 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE) 16 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *); 17 #endif 18 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *); 19 20 PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions) { 21 Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data; 22 PetscInt m, n, i; 23 PetscInt ib, jb, bs = A->rmap->bs; 24 MatScalar *a_val = a_aij->a; 25 26 PetscFunctionBegin; 27 PetscCall(MatGetSize(A, &m, &n)); 28 for (i = 0; i < n; i++) reductions[i] = 0.0; 29 if (type == NORM_2) { 30 for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) { 31 for (jb = 0; jb < bs; jb++) { 32 for (ib = 0; ib < bs; ib++) { 33 reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val); 34 a_val++; 35 } 36 } 37 } 38 } else if (type == NORM_1) { 39 for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) { 40 for (jb = 0; jb < bs; jb++) { 41 for (ib = 0; ib < bs; ib++) { 42 reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val); 43 a_val++; 44 } 45 } 46 } 47 } else if (type == NORM_INFINITY) { 48 for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) { 49 for (jb = 0; jb < bs; jb++) { 50 for (ib = 0; ib < bs; ib++) { 51 int col = A->cmap->rstart + a_aij->j[i] * bs + jb; 52 reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]); 53 a_val++; 54 } 55 } 56 } 57 } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) { 58 for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) { 59 for (jb = 0; jb < bs; jb++) { 60 for (ib = 0; ib < bs; ib++) { 61 reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val); 62 a_val++; 63 } 64 } 65 } 66 } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) { 67 for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) { 68 for (jb = 0; jb < bs; jb++) { 69 for (ib = 0; ib < bs; ib++) { 70 reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val); 71 a_val++; 72 } 73 } 74 } 75 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type"); 76 if (type == NORM_2) { 77 for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]); 78 } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) { 79 for (i = 0; i < n; i++) reductions[i] /= m; 80 } 81 PetscFunctionReturn(0); 82 } 83 84 PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values) { 85 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 86 PetscInt *diag_offset, i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots; 87 MatScalar *v = a->a, *odiag, *diag, work[25], *v_work; 88 PetscReal shift = 0.0; 89 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 90 91 PetscFunctionBegin; 92 allowzeropivot = PetscNot(A->erroriffailure); 93 94 if (a->idiagvalid) { 95 if (values) *values = a->idiag; 96 PetscFunctionReturn(0); 97 } 98 PetscCall(MatMarkDiagonal_SeqBAIJ(A)); 99 diag_offset = a->diag; 100 if (!a->idiag) { PetscCall(PetscMalloc1(bs2 * mbs, &a->idiag)); } 101 diag = a->idiag; 102 if (values) *values = a->idiag; 103 /* factor and invert each block */ 104 switch (bs) { 105 case 1: 106 for (i = 0; i < mbs; i++) { 107 odiag = v + 1 * diag_offset[i]; 108 diag[0] = odiag[0]; 109 110 if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) { 111 if (allowzeropivot) { 112 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 113 A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]); 114 A->factorerror_zeropivot_row = i; 115 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i)); 116 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON); 117 } 118 119 diag[0] = (PetscScalar)1.0 / (diag[0] + shift); 120 diag += 1; 121 } 122 break; 123 case 2: 124 for (i = 0; i < mbs; i++) { 125 odiag = v + 4 * diag_offset[i]; 126 diag[0] = odiag[0]; 127 diag[1] = odiag[1]; 128 diag[2] = odiag[2]; 129 diag[3] = odiag[3]; 130 PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected)); 131 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 132 diag += 4; 133 } 134 break; 135 case 3: 136 for (i = 0; i < mbs; i++) { 137 odiag = v + 9 * diag_offset[i]; 138 diag[0] = odiag[0]; 139 diag[1] = odiag[1]; 140 diag[2] = odiag[2]; 141 diag[3] = odiag[3]; 142 diag[4] = odiag[4]; 143 diag[5] = odiag[5]; 144 diag[6] = odiag[6]; 145 diag[7] = odiag[7]; 146 diag[8] = odiag[8]; 147 PetscCall(PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected)); 148 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 149 diag += 9; 150 } 151 break; 152 case 4: 153 for (i = 0; i < mbs; i++) { 154 odiag = v + 16 * diag_offset[i]; 155 PetscCall(PetscArraycpy(diag, odiag, 16)); 156 PetscCall(PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected)); 157 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 158 diag += 16; 159 } 160 break; 161 case 5: 162 for (i = 0; i < mbs; i++) { 163 odiag = v + 25 * diag_offset[i]; 164 PetscCall(PetscArraycpy(diag, odiag, 25)); 165 PetscCall(PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 166 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 167 diag += 25; 168 } 169 break; 170 case 6: 171 for (i = 0; i < mbs; i++) { 172 odiag = v + 36 * diag_offset[i]; 173 PetscCall(PetscArraycpy(diag, odiag, 36)); 174 PetscCall(PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected)); 175 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 176 diag += 36; 177 } 178 break; 179 case 7: 180 for (i = 0; i < mbs; i++) { 181 odiag = v + 49 * diag_offset[i]; 182 PetscCall(PetscArraycpy(diag, odiag, 49)); 183 PetscCall(PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected)); 184 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 185 diag += 49; 186 } 187 break; 188 default: 189 PetscCall(PetscMalloc2(bs, &v_work, bs, &v_pivots)); 190 for (i = 0; i < mbs; i++) { 191 odiag = v + bs2 * diag_offset[i]; 192 PetscCall(PetscArraycpy(diag, odiag, bs2)); 193 PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 194 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 195 diag += bs2; 196 } 197 PetscCall(PetscFree2(v_work, v_pivots)); 198 } 199 a->idiagvalid = PETSC_TRUE; 200 PetscFunctionReturn(0); 201 } 202 203 PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) { 204 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 205 PetscScalar *x, *work, *w, *workt, *t; 206 const MatScalar *v, *aa = a->a, *idiag; 207 const PetscScalar *b, *xb; 208 PetscScalar s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */ 209 PetscInt m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it; 210 const PetscInt *diag, *ai = a->i, *aj = a->j, *vi; 211 212 PetscFunctionBegin; 213 its = its * lits; 214 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 215 PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits); 216 PetscCheck(!fshift, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for diagonal shift"); 217 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for non-trivial relaxation factor"); 218 PetscCheck(!(flag & SOR_APPLY_UPPER) && !(flag & SOR_APPLY_LOWER), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for applying upper or lower triangular parts"); 219 220 if (!a->idiagvalid) PetscCall(MatInvertBlockDiagonal(A, NULL)); 221 222 if (!m) PetscFunctionReturn(0); 223 diag = a->diag; 224 idiag = a->idiag; 225 k = PetscMax(A->rmap->n, A->cmap->n); 226 if (!a->mult_work) PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 227 if (!a->sor_workt) PetscCall(PetscMalloc1(k, &a->sor_workt)); 228 if (!a->sor_work) PetscCall(PetscMalloc1(bs, &a->sor_work)); 229 work = a->mult_work; 230 t = a->sor_workt; 231 w = a->sor_work; 232 233 PetscCall(VecGetArray(xx, &x)); 234 PetscCall(VecGetArrayRead(bb, &b)); 235 236 if (flag & SOR_ZERO_INITIAL_GUESS) { 237 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 238 switch (bs) { 239 case 1: 240 PetscKernel_v_gets_A_times_w_1(x, idiag, b); 241 t[0] = b[0]; 242 i2 = 1; 243 idiag += 1; 244 for (i = 1; i < m; i++) { 245 v = aa + ai[i]; 246 vi = aj + ai[i]; 247 nz = diag[i] - ai[i]; 248 s[0] = b[i2]; 249 for (j = 0; j < nz; j++) { 250 xw[0] = x[vi[j]]; 251 PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw); 252 } 253 t[i2] = s[0]; 254 PetscKernel_v_gets_A_times_w_1(xw, idiag, s); 255 x[i2] = xw[0]; 256 idiag += 1; 257 i2 += 1; 258 } 259 break; 260 case 2: 261 PetscKernel_v_gets_A_times_w_2(x, idiag, b); 262 t[0] = b[0]; 263 t[1] = b[1]; 264 i2 = 2; 265 idiag += 4; 266 for (i = 1; i < m; i++) { 267 v = aa + 4 * ai[i]; 268 vi = aj + ai[i]; 269 nz = diag[i] - ai[i]; 270 s[0] = b[i2]; 271 s[1] = b[i2 + 1]; 272 for (j = 0; j < nz; j++) { 273 idx = 2 * vi[j]; 274 it = 4 * j; 275 xw[0] = x[idx]; 276 xw[1] = x[1 + idx]; 277 PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw); 278 } 279 t[i2] = s[0]; 280 t[i2 + 1] = s[1]; 281 PetscKernel_v_gets_A_times_w_2(xw, idiag, s); 282 x[i2] = xw[0]; 283 x[i2 + 1] = xw[1]; 284 idiag += 4; 285 i2 += 2; 286 } 287 break; 288 case 3: 289 PetscKernel_v_gets_A_times_w_3(x, idiag, b); 290 t[0] = b[0]; 291 t[1] = b[1]; 292 t[2] = b[2]; 293 i2 = 3; 294 idiag += 9; 295 for (i = 1; i < m; i++) { 296 v = aa + 9 * ai[i]; 297 vi = aj + ai[i]; 298 nz = diag[i] - ai[i]; 299 s[0] = b[i2]; 300 s[1] = b[i2 + 1]; 301 s[2] = b[i2 + 2]; 302 while (nz--) { 303 idx = 3 * (*vi++); 304 xw[0] = x[idx]; 305 xw[1] = x[1 + idx]; 306 xw[2] = x[2 + idx]; 307 PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw); 308 v += 9; 309 } 310 t[i2] = s[0]; 311 t[i2 + 1] = s[1]; 312 t[i2 + 2] = s[2]; 313 PetscKernel_v_gets_A_times_w_3(xw, idiag, s); 314 x[i2] = xw[0]; 315 x[i2 + 1] = xw[1]; 316 x[i2 + 2] = xw[2]; 317 idiag += 9; 318 i2 += 3; 319 } 320 break; 321 case 4: 322 PetscKernel_v_gets_A_times_w_4(x, idiag, b); 323 t[0] = b[0]; 324 t[1] = b[1]; 325 t[2] = b[2]; 326 t[3] = b[3]; 327 i2 = 4; 328 idiag += 16; 329 for (i = 1; i < m; i++) { 330 v = aa + 16 * ai[i]; 331 vi = aj + ai[i]; 332 nz = diag[i] - ai[i]; 333 s[0] = b[i2]; 334 s[1] = b[i2 + 1]; 335 s[2] = b[i2 + 2]; 336 s[3] = b[i2 + 3]; 337 while (nz--) { 338 idx = 4 * (*vi++); 339 xw[0] = x[idx]; 340 xw[1] = x[1 + idx]; 341 xw[2] = x[2 + idx]; 342 xw[3] = x[3 + idx]; 343 PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw); 344 v += 16; 345 } 346 t[i2] = s[0]; 347 t[i2 + 1] = s[1]; 348 t[i2 + 2] = s[2]; 349 t[i2 + 3] = s[3]; 350 PetscKernel_v_gets_A_times_w_4(xw, idiag, s); 351 x[i2] = xw[0]; 352 x[i2 + 1] = xw[1]; 353 x[i2 + 2] = xw[2]; 354 x[i2 + 3] = xw[3]; 355 idiag += 16; 356 i2 += 4; 357 } 358 break; 359 case 5: 360 PetscKernel_v_gets_A_times_w_5(x, idiag, b); 361 t[0] = b[0]; 362 t[1] = b[1]; 363 t[2] = b[2]; 364 t[3] = b[3]; 365 t[4] = b[4]; 366 i2 = 5; 367 idiag += 25; 368 for (i = 1; i < m; i++) { 369 v = aa + 25 * ai[i]; 370 vi = aj + ai[i]; 371 nz = diag[i] - ai[i]; 372 s[0] = b[i2]; 373 s[1] = b[i2 + 1]; 374 s[2] = b[i2 + 2]; 375 s[3] = b[i2 + 3]; 376 s[4] = b[i2 + 4]; 377 while (nz--) { 378 idx = 5 * (*vi++); 379 xw[0] = x[idx]; 380 xw[1] = x[1 + idx]; 381 xw[2] = x[2 + idx]; 382 xw[3] = x[3 + idx]; 383 xw[4] = x[4 + idx]; 384 PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw); 385 v += 25; 386 } 387 t[i2] = s[0]; 388 t[i2 + 1] = s[1]; 389 t[i2 + 2] = s[2]; 390 t[i2 + 3] = s[3]; 391 t[i2 + 4] = s[4]; 392 PetscKernel_v_gets_A_times_w_5(xw, idiag, s); 393 x[i2] = xw[0]; 394 x[i2 + 1] = xw[1]; 395 x[i2 + 2] = xw[2]; 396 x[i2 + 3] = xw[3]; 397 x[i2 + 4] = xw[4]; 398 idiag += 25; 399 i2 += 5; 400 } 401 break; 402 case 6: 403 PetscKernel_v_gets_A_times_w_6(x, idiag, b); 404 t[0] = b[0]; 405 t[1] = b[1]; 406 t[2] = b[2]; 407 t[3] = b[3]; 408 t[4] = b[4]; 409 t[5] = b[5]; 410 i2 = 6; 411 idiag += 36; 412 for (i = 1; i < m; i++) { 413 v = aa + 36 * ai[i]; 414 vi = aj + ai[i]; 415 nz = diag[i] - ai[i]; 416 s[0] = b[i2]; 417 s[1] = b[i2 + 1]; 418 s[2] = b[i2 + 2]; 419 s[3] = b[i2 + 3]; 420 s[4] = b[i2 + 4]; 421 s[5] = b[i2 + 5]; 422 while (nz--) { 423 idx = 6 * (*vi++); 424 xw[0] = x[idx]; 425 xw[1] = x[1 + idx]; 426 xw[2] = x[2 + idx]; 427 xw[3] = x[3 + idx]; 428 xw[4] = x[4 + idx]; 429 xw[5] = x[5 + idx]; 430 PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw); 431 v += 36; 432 } 433 t[i2] = s[0]; 434 t[i2 + 1] = s[1]; 435 t[i2 + 2] = s[2]; 436 t[i2 + 3] = s[3]; 437 t[i2 + 4] = s[4]; 438 t[i2 + 5] = s[5]; 439 PetscKernel_v_gets_A_times_w_6(xw, idiag, s); 440 x[i2] = xw[0]; 441 x[i2 + 1] = xw[1]; 442 x[i2 + 2] = xw[2]; 443 x[i2 + 3] = xw[3]; 444 x[i2 + 4] = xw[4]; 445 x[i2 + 5] = xw[5]; 446 idiag += 36; 447 i2 += 6; 448 } 449 break; 450 case 7: 451 PetscKernel_v_gets_A_times_w_7(x, idiag, b); 452 t[0] = b[0]; 453 t[1] = b[1]; 454 t[2] = b[2]; 455 t[3] = b[3]; 456 t[4] = b[4]; 457 t[5] = b[5]; 458 t[6] = b[6]; 459 i2 = 7; 460 idiag += 49; 461 for (i = 1; i < m; i++) { 462 v = aa + 49 * ai[i]; 463 vi = aj + ai[i]; 464 nz = diag[i] - ai[i]; 465 s[0] = b[i2]; 466 s[1] = b[i2 + 1]; 467 s[2] = b[i2 + 2]; 468 s[3] = b[i2 + 3]; 469 s[4] = b[i2 + 4]; 470 s[5] = b[i2 + 5]; 471 s[6] = b[i2 + 6]; 472 while (nz--) { 473 idx = 7 * (*vi++); 474 xw[0] = x[idx]; 475 xw[1] = x[1 + idx]; 476 xw[2] = x[2 + idx]; 477 xw[3] = x[3 + idx]; 478 xw[4] = x[4 + idx]; 479 xw[5] = x[5 + idx]; 480 xw[6] = x[6 + idx]; 481 PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw); 482 v += 49; 483 } 484 t[i2] = s[0]; 485 t[i2 + 1] = s[1]; 486 t[i2 + 2] = s[2]; 487 t[i2 + 3] = s[3]; 488 t[i2 + 4] = s[4]; 489 t[i2 + 5] = s[5]; 490 t[i2 + 6] = s[6]; 491 PetscKernel_v_gets_A_times_w_7(xw, idiag, s); 492 x[i2] = xw[0]; 493 x[i2 + 1] = xw[1]; 494 x[i2 + 2] = xw[2]; 495 x[i2 + 3] = xw[3]; 496 x[i2 + 4] = xw[4]; 497 x[i2 + 5] = xw[5]; 498 x[i2 + 6] = xw[6]; 499 idiag += 49; 500 i2 += 7; 501 } 502 break; 503 default: 504 PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x); 505 PetscCall(PetscArraycpy(t, b, bs)); 506 i2 = bs; 507 idiag += bs2; 508 for (i = 1; i < m; i++) { 509 v = aa + bs2 * ai[i]; 510 vi = aj + ai[i]; 511 nz = diag[i] - ai[i]; 512 513 PetscCall(PetscArraycpy(w, b + i2, bs)); 514 /* copy all rows of x that are needed into contiguous space */ 515 workt = work; 516 for (j = 0; j < nz; j++) { 517 PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs)); 518 workt += bs; 519 } 520 PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work); 521 PetscCall(PetscArraycpy(t + i2, w, bs)); 522 PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2); 523 524 idiag += bs2; 525 i2 += bs; 526 } 527 break; 528 } 529 /* for logging purposes assume number of nonzero in lower half is 1/2 of total */ 530 PetscCall(PetscLogFlops(1.0 * bs2 * a->nz)); 531 xb = t; 532 } else xb = b; 533 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 534 idiag = a->idiag + bs2 * (a->mbs - 1); 535 i2 = bs * (m - 1); 536 switch (bs) { 537 case 1: 538 s[0] = xb[i2]; 539 PetscKernel_v_gets_A_times_w_1(xw, idiag, s); 540 x[i2] = xw[0]; 541 i2 -= 1; 542 for (i = m - 2; i >= 0; i--) { 543 v = aa + (diag[i] + 1); 544 vi = aj + diag[i] + 1; 545 nz = ai[i + 1] - diag[i] - 1; 546 s[0] = xb[i2]; 547 for (j = 0; j < nz; j++) { 548 xw[0] = x[vi[j]]; 549 PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw); 550 } 551 PetscKernel_v_gets_A_times_w_1(xw, idiag, s); 552 x[i2] = xw[0]; 553 idiag -= 1; 554 i2 -= 1; 555 } 556 break; 557 case 2: 558 s[0] = xb[i2]; 559 s[1] = xb[i2 + 1]; 560 PetscKernel_v_gets_A_times_w_2(xw, idiag, s); 561 x[i2] = xw[0]; 562 x[i2 + 1] = xw[1]; 563 i2 -= 2; 564 idiag -= 4; 565 for (i = m - 2; i >= 0; i--) { 566 v = aa + 4 * (diag[i] + 1); 567 vi = aj + diag[i] + 1; 568 nz = ai[i + 1] - diag[i] - 1; 569 s[0] = xb[i2]; 570 s[1] = xb[i2 + 1]; 571 for (j = 0; j < nz; j++) { 572 idx = 2 * vi[j]; 573 it = 4 * j; 574 xw[0] = x[idx]; 575 xw[1] = x[1 + idx]; 576 PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw); 577 } 578 PetscKernel_v_gets_A_times_w_2(xw, idiag, s); 579 x[i2] = xw[0]; 580 x[i2 + 1] = xw[1]; 581 idiag -= 4; 582 i2 -= 2; 583 } 584 break; 585 case 3: 586 s[0] = xb[i2]; 587 s[1] = xb[i2 + 1]; 588 s[2] = xb[i2 + 2]; 589 PetscKernel_v_gets_A_times_w_3(xw, idiag, s); 590 x[i2] = xw[0]; 591 x[i2 + 1] = xw[1]; 592 x[i2 + 2] = xw[2]; 593 i2 -= 3; 594 idiag -= 9; 595 for (i = m - 2; i >= 0; i--) { 596 v = aa + 9 * (diag[i] + 1); 597 vi = aj + diag[i] + 1; 598 nz = ai[i + 1] - diag[i] - 1; 599 s[0] = xb[i2]; 600 s[1] = xb[i2 + 1]; 601 s[2] = xb[i2 + 2]; 602 while (nz--) { 603 idx = 3 * (*vi++); 604 xw[0] = x[idx]; 605 xw[1] = x[1 + idx]; 606 xw[2] = x[2 + idx]; 607 PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw); 608 v += 9; 609 } 610 PetscKernel_v_gets_A_times_w_3(xw, idiag, s); 611 x[i2] = xw[0]; 612 x[i2 + 1] = xw[1]; 613 x[i2 + 2] = xw[2]; 614 idiag -= 9; 615 i2 -= 3; 616 } 617 break; 618 case 4: 619 s[0] = xb[i2]; 620 s[1] = xb[i2 + 1]; 621 s[2] = xb[i2 + 2]; 622 s[3] = xb[i2 + 3]; 623 PetscKernel_v_gets_A_times_w_4(xw, idiag, s); 624 x[i2] = xw[0]; 625 x[i2 + 1] = xw[1]; 626 x[i2 + 2] = xw[2]; 627 x[i2 + 3] = xw[3]; 628 i2 -= 4; 629 idiag -= 16; 630 for (i = m - 2; i >= 0; i--) { 631 v = aa + 16 * (diag[i] + 1); 632 vi = aj + diag[i] + 1; 633 nz = ai[i + 1] - diag[i] - 1; 634 s[0] = xb[i2]; 635 s[1] = xb[i2 + 1]; 636 s[2] = xb[i2 + 2]; 637 s[3] = xb[i2 + 3]; 638 while (nz--) { 639 idx = 4 * (*vi++); 640 xw[0] = x[idx]; 641 xw[1] = x[1 + idx]; 642 xw[2] = x[2 + idx]; 643 xw[3] = x[3 + idx]; 644 PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw); 645 v += 16; 646 } 647 PetscKernel_v_gets_A_times_w_4(xw, idiag, s); 648 x[i2] = xw[0]; 649 x[i2 + 1] = xw[1]; 650 x[i2 + 2] = xw[2]; 651 x[i2 + 3] = xw[3]; 652 idiag -= 16; 653 i2 -= 4; 654 } 655 break; 656 case 5: 657 s[0] = xb[i2]; 658 s[1] = xb[i2 + 1]; 659 s[2] = xb[i2 + 2]; 660 s[3] = xb[i2 + 3]; 661 s[4] = xb[i2 + 4]; 662 PetscKernel_v_gets_A_times_w_5(xw, idiag, s); 663 x[i2] = xw[0]; 664 x[i2 + 1] = xw[1]; 665 x[i2 + 2] = xw[2]; 666 x[i2 + 3] = xw[3]; 667 x[i2 + 4] = xw[4]; 668 i2 -= 5; 669 idiag -= 25; 670 for (i = m - 2; i >= 0; i--) { 671 v = aa + 25 * (diag[i] + 1); 672 vi = aj + diag[i] + 1; 673 nz = ai[i + 1] - diag[i] - 1; 674 s[0] = xb[i2]; 675 s[1] = xb[i2 + 1]; 676 s[2] = xb[i2 + 2]; 677 s[3] = xb[i2 + 3]; 678 s[4] = xb[i2 + 4]; 679 while (nz--) { 680 idx = 5 * (*vi++); 681 xw[0] = x[idx]; 682 xw[1] = x[1 + idx]; 683 xw[2] = x[2 + idx]; 684 xw[3] = x[3 + idx]; 685 xw[4] = x[4 + idx]; 686 PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw); 687 v += 25; 688 } 689 PetscKernel_v_gets_A_times_w_5(xw, idiag, s); 690 x[i2] = xw[0]; 691 x[i2 + 1] = xw[1]; 692 x[i2 + 2] = xw[2]; 693 x[i2 + 3] = xw[3]; 694 x[i2 + 4] = xw[4]; 695 idiag -= 25; 696 i2 -= 5; 697 } 698 break; 699 case 6: 700 s[0] = xb[i2]; 701 s[1] = xb[i2 + 1]; 702 s[2] = xb[i2 + 2]; 703 s[3] = xb[i2 + 3]; 704 s[4] = xb[i2 + 4]; 705 s[5] = xb[i2 + 5]; 706 PetscKernel_v_gets_A_times_w_6(xw, idiag, s); 707 x[i2] = xw[0]; 708 x[i2 + 1] = xw[1]; 709 x[i2 + 2] = xw[2]; 710 x[i2 + 3] = xw[3]; 711 x[i2 + 4] = xw[4]; 712 x[i2 + 5] = xw[5]; 713 i2 -= 6; 714 idiag -= 36; 715 for (i = m - 2; i >= 0; i--) { 716 v = aa + 36 * (diag[i] + 1); 717 vi = aj + diag[i] + 1; 718 nz = ai[i + 1] - diag[i] - 1; 719 s[0] = xb[i2]; 720 s[1] = xb[i2 + 1]; 721 s[2] = xb[i2 + 2]; 722 s[3] = xb[i2 + 3]; 723 s[4] = xb[i2 + 4]; 724 s[5] = xb[i2 + 5]; 725 while (nz--) { 726 idx = 6 * (*vi++); 727 xw[0] = x[idx]; 728 xw[1] = x[1 + idx]; 729 xw[2] = x[2 + idx]; 730 xw[3] = x[3 + idx]; 731 xw[4] = x[4 + idx]; 732 xw[5] = x[5 + idx]; 733 PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw); 734 v += 36; 735 } 736 PetscKernel_v_gets_A_times_w_6(xw, idiag, s); 737 x[i2] = xw[0]; 738 x[i2 + 1] = xw[1]; 739 x[i2 + 2] = xw[2]; 740 x[i2 + 3] = xw[3]; 741 x[i2 + 4] = xw[4]; 742 x[i2 + 5] = xw[5]; 743 idiag -= 36; 744 i2 -= 6; 745 } 746 break; 747 case 7: 748 s[0] = xb[i2]; 749 s[1] = xb[i2 + 1]; 750 s[2] = xb[i2 + 2]; 751 s[3] = xb[i2 + 3]; 752 s[4] = xb[i2 + 4]; 753 s[5] = xb[i2 + 5]; 754 s[6] = xb[i2 + 6]; 755 PetscKernel_v_gets_A_times_w_7(x, idiag, b); 756 x[i2] = xw[0]; 757 x[i2 + 1] = xw[1]; 758 x[i2 + 2] = xw[2]; 759 x[i2 + 3] = xw[3]; 760 x[i2 + 4] = xw[4]; 761 x[i2 + 5] = xw[5]; 762 x[i2 + 6] = xw[6]; 763 i2 -= 7; 764 idiag -= 49; 765 for (i = m - 2; i >= 0; i--) { 766 v = aa + 49 * (diag[i] + 1); 767 vi = aj + diag[i] + 1; 768 nz = ai[i + 1] - diag[i] - 1; 769 s[0] = xb[i2]; 770 s[1] = xb[i2 + 1]; 771 s[2] = xb[i2 + 2]; 772 s[3] = xb[i2 + 3]; 773 s[4] = xb[i2 + 4]; 774 s[5] = xb[i2 + 5]; 775 s[6] = xb[i2 + 6]; 776 while (nz--) { 777 idx = 7 * (*vi++); 778 xw[0] = x[idx]; 779 xw[1] = x[1 + idx]; 780 xw[2] = x[2 + idx]; 781 xw[3] = x[3 + idx]; 782 xw[4] = x[4 + idx]; 783 xw[5] = x[5 + idx]; 784 xw[6] = x[6 + idx]; 785 PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw); 786 v += 49; 787 } 788 PetscKernel_v_gets_A_times_w_7(xw, idiag, s); 789 x[i2] = xw[0]; 790 x[i2 + 1] = xw[1]; 791 x[i2 + 2] = xw[2]; 792 x[i2 + 3] = xw[3]; 793 x[i2 + 4] = xw[4]; 794 x[i2 + 5] = xw[5]; 795 x[i2 + 6] = xw[6]; 796 idiag -= 49; 797 i2 -= 7; 798 } 799 break; 800 default: 801 PetscCall(PetscArraycpy(w, xb + i2, bs)); 802 PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2); 803 i2 -= bs; 804 idiag -= bs2; 805 for (i = m - 2; i >= 0; i--) { 806 v = aa + bs2 * (diag[i] + 1); 807 vi = aj + diag[i] + 1; 808 nz = ai[i + 1] - diag[i] - 1; 809 810 PetscCall(PetscArraycpy(w, xb + i2, bs)); 811 /* copy all rows of x that are needed into contiguous space */ 812 workt = work; 813 for (j = 0; j < nz; j++) { 814 PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs)); 815 workt += bs; 816 } 817 PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work); 818 PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2); 819 820 idiag -= bs2; 821 i2 -= bs; 822 } 823 break; 824 } 825 PetscCall(PetscLogFlops(1.0 * bs2 * (a->nz))); 826 } 827 its--; 828 } 829 while (its--) { 830 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 831 idiag = a->idiag; 832 i2 = 0; 833 switch (bs) { 834 case 1: 835 for (i = 0; i < m; i++) { 836 v = aa + ai[i]; 837 vi = aj + ai[i]; 838 nz = ai[i + 1] - ai[i]; 839 s[0] = b[i2]; 840 for (j = 0; j < nz; j++) { 841 xw[0] = x[vi[j]]; 842 PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw); 843 } 844 PetscKernel_v_gets_A_times_w_1(xw, idiag, s); 845 x[i2] += xw[0]; 846 idiag += 1; 847 i2 += 1; 848 } 849 break; 850 case 2: 851 for (i = 0; i < m; i++) { 852 v = aa + 4 * ai[i]; 853 vi = aj + ai[i]; 854 nz = ai[i + 1] - ai[i]; 855 s[0] = b[i2]; 856 s[1] = b[i2 + 1]; 857 for (j = 0; j < nz; j++) { 858 idx = 2 * vi[j]; 859 it = 4 * j; 860 xw[0] = x[idx]; 861 xw[1] = x[1 + idx]; 862 PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw); 863 } 864 PetscKernel_v_gets_A_times_w_2(xw, idiag, s); 865 x[i2] += xw[0]; 866 x[i2 + 1] += xw[1]; 867 idiag += 4; 868 i2 += 2; 869 } 870 break; 871 case 3: 872 for (i = 0; i < m; i++) { 873 v = aa + 9 * ai[i]; 874 vi = aj + ai[i]; 875 nz = ai[i + 1] - ai[i]; 876 s[0] = b[i2]; 877 s[1] = b[i2 + 1]; 878 s[2] = b[i2 + 2]; 879 while (nz--) { 880 idx = 3 * (*vi++); 881 xw[0] = x[idx]; 882 xw[1] = x[1 + idx]; 883 xw[2] = x[2 + idx]; 884 PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw); 885 v += 9; 886 } 887 PetscKernel_v_gets_A_times_w_3(xw, idiag, s); 888 x[i2] += xw[0]; 889 x[i2 + 1] += xw[1]; 890 x[i2 + 2] += xw[2]; 891 idiag += 9; 892 i2 += 3; 893 } 894 break; 895 case 4: 896 for (i = 0; i < m; i++) { 897 v = aa + 16 * ai[i]; 898 vi = aj + ai[i]; 899 nz = ai[i + 1] - ai[i]; 900 s[0] = b[i2]; 901 s[1] = b[i2 + 1]; 902 s[2] = b[i2 + 2]; 903 s[3] = b[i2 + 3]; 904 while (nz--) { 905 idx = 4 * (*vi++); 906 xw[0] = x[idx]; 907 xw[1] = x[1 + idx]; 908 xw[2] = x[2 + idx]; 909 xw[3] = x[3 + idx]; 910 PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw); 911 v += 16; 912 } 913 PetscKernel_v_gets_A_times_w_4(xw, idiag, s); 914 x[i2] += xw[0]; 915 x[i2 + 1] += xw[1]; 916 x[i2 + 2] += xw[2]; 917 x[i2 + 3] += xw[3]; 918 idiag += 16; 919 i2 += 4; 920 } 921 break; 922 case 5: 923 for (i = 0; i < m; i++) { 924 v = aa + 25 * ai[i]; 925 vi = aj + ai[i]; 926 nz = ai[i + 1] - ai[i]; 927 s[0] = b[i2]; 928 s[1] = b[i2 + 1]; 929 s[2] = b[i2 + 2]; 930 s[3] = b[i2 + 3]; 931 s[4] = b[i2 + 4]; 932 while (nz--) { 933 idx = 5 * (*vi++); 934 xw[0] = x[idx]; 935 xw[1] = x[1 + idx]; 936 xw[2] = x[2 + idx]; 937 xw[3] = x[3 + idx]; 938 xw[4] = x[4 + idx]; 939 PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw); 940 v += 25; 941 } 942 PetscKernel_v_gets_A_times_w_5(xw, idiag, s); 943 x[i2] += xw[0]; 944 x[i2 + 1] += xw[1]; 945 x[i2 + 2] += xw[2]; 946 x[i2 + 3] += xw[3]; 947 x[i2 + 4] += xw[4]; 948 idiag += 25; 949 i2 += 5; 950 } 951 break; 952 case 6: 953 for (i = 0; i < m; i++) { 954 v = aa + 36 * ai[i]; 955 vi = aj + ai[i]; 956 nz = ai[i + 1] - ai[i]; 957 s[0] = b[i2]; 958 s[1] = b[i2 + 1]; 959 s[2] = b[i2 + 2]; 960 s[3] = b[i2 + 3]; 961 s[4] = b[i2 + 4]; 962 s[5] = b[i2 + 5]; 963 while (nz--) { 964 idx = 6 * (*vi++); 965 xw[0] = x[idx]; 966 xw[1] = x[1 + idx]; 967 xw[2] = x[2 + idx]; 968 xw[3] = x[3 + idx]; 969 xw[4] = x[4 + idx]; 970 xw[5] = x[5 + idx]; 971 PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw); 972 v += 36; 973 } 974 PetscKernel_v_gets_A_times_w_6(xw, idiag, s); 975 x[i2] += xw[0]; 976 x[i2 + 1] += xw[1]; 977 x[i2 + 2] += xw[2]; 978 x[i2 + 3] += xw[3]; 979 x[i2 + 4] += xw[4]; 980 x[i2 + 5] += xw[5]; 981 idiag += 36; 982 i2 += 6; 983 } 984 break; 985 case 7: 986 for (i = 0; i < m; i++) { 987 v = aa + 49 * ai[i]; 988 vi = aj + ai[i]; 989 nz = ai[i + 1] - ai[i]; 990 s[0] = b[i2]; 991 s[1] = b[i2 + 1]; 992 s[2] = b[i2 + 2]; 993 s[3] = b[i2 + 3]; 994 s[4] = b[i2 + 4]; 995 s[5] = b[i2 + 5]; 996 s[6] = b[i2 + 6]; 997 while (nz--) { 998 idx = 7 * (*vi++); 999 xw[0] = x[idx]; 1000 xw[1] = x[1 + idx]; 1001 xw[2] = x[2 + idx]; 1002 xw[3] = x[3 + idx]; 1003 xw[4] = x[4 + idx]; 1004 xw[5] = x[5 + idx]; 1005 xw[6] = x[6 + idx]; 1006 PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw); 1007 v += 49; 1008 } 1009 PetscKernel_v_gets_A_times_w_7(xw, idiag, s); 1010 x[i2] += xw[0]; 1011 x[i2 + 1] += xw[1]; 1012 x[i2 + 2] += xw[2]; 1013 x[i2 + 3] += xw[3]; 1014 x[i2 + 4] += xw[4]; 1015 x[i2 + 5] += xw[5]; 1016 x[i2 + 6] += xw[6]; 1017 idiag += 49; 1018 i2 += 7; 1019 } 1020 break; 1021 default: 1022 for (i = 0; i < m; i++) { 1023 v = aa + bs2 * ai[i]; 1024 vi = aj + ai[i]; 1025 nz = ai[i + 1] - ai[i]; 1026 1027 PetscCall(PetscArraycpy(w, b + i2, bs)); 1028 /* copy all rows of x that are needed into contiguous space */ 1029 workt = work; 1030 for (j = 0; j < nz; j++) { 1031 PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs)); 1032 workt += bs; 1033 } 1034 PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work); 1035 PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2); 1036 1037 idiag += bs2; 1038 i2 += bs; 1039 } 1040 break; 1041 } 1042 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1043 } 1044 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 1045 idiag = a->idiag + bs2 * (a->mbs - 1); 1046 i2 = bs * (m - 1); 1047 switch (bs) { 1048 case 1: 1049 for (i = m - 1; i >= 0; i--) { 1050 v = aa + ai[i]; 1051 vi = aj + ai[i]; 1052 nz = ai[i + 1] - ai[i]; 1053 s[0] = b[i2]; 1054 for (j = 0; j < nz; j++) { 1055 xw[0] = x[vi[j]]; 1056 PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw); 1057 } 1058 PetscKernel_v_gets_A_times_w_1(xw, idiag, s); 1059 x[i2] += xw[0]; 1060 idiag -= 1; 1061 i2 -= 1; 1062 } 1063 break; 1064 case 2: 1065 for (i = m - 1; i >= 0; i--) { 1066 v = aa + 4 * ai[i]; 1067 vi = aj + ai[i]; 1068 nz = ai[i + 1] - ai[i]; 1069 s[0] = b[i2]; 1070 s[1] = b[i2 + 1]; 1071 for (j = 0; j < nz; j++) { 1072 idx = 2 * vi[j]; 1073 it = 4 * j; 1074 xw[0] = x[idx]; 1075 xw[1] = x[1 + idx]; 1076 PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw); 1077 } 1078 PetscKernel_v_gets_A_times_w_2(xw, idiag, s); 1079 x[i2] += xw[0]; 1080 x[i2 + 1] += xw[1]; 1081 idiag -= 4; 1082 i2 -= 2; 1083 } 1084 break; 1085 case 3: 1086 for (i = m - 1; i >= 0; i--) { 1087 v = aa + 9 * ai[i]; 1088 vi = aj + ai[i]; 1089 nz = ai[i + 1] - ai[i]; 1090 s[0] = b[i2]; 1091 s[1] = b[i2 + 1]; 1092 s[2] = b[i2 + 2]; 1093 while (nz--) { 1094 idx = 3 * (*vi++); 1095 xw[0] = x[idx]; 1096 xw[1] = x[1 + idx]; 1097 xw[2] = x[2 + idx]; 1098 PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw); 1099 v += 9; 1100 } 1101 PetscKernel_v_gets_A_times_w_3(xw, idiag, s); 1102 x[i2] += xw[0]; 1103 x[i2 + 1] += xw[1]; 1104 x[i2 + 2] += xw[2]; 1105 idiag -= 9; 1106 i2 -= 3; 1107 } 1108 break; 1109 case 4: 1110 for (i = m - 1; i >= 0; i--) { 1111 v = aa + 16 * ai[i]; 1112 vi = aj + ai[i]; 1113 nz = ai[i + 1] - ai[i]; 1114 s[0] = b[i2]; 1115 s[1] = b[i2 + 1]; 1116 s[2] = b[i2 + 2]; 1117 s[3] = b[i2 + 3]; 1118 while (nz--) { 1119 idx = 4 * (*vi++); 1120 xw[0] = x[idx]; 1121 xw[1] = x[1 + idx]; 1122 xw[2] = x[2 + idx]; 1123 xw[3] = x[3 + idx]; 1124 PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw); 1125 v += 16; 1126 } 1127 PetscKernel_v_gets_A_times_w_4(xw, idiag, s); 1128 x[i2] += xw[0]; 1129 x[i2 + 1] += xw[1]; 1130 x[i2 + 2] += xw[2]; 1131 x[i2 + 3] += xw[3]; 1132 idiag -= 16; 1133 i2 -= 4; 1134 } 1135 break; 1136 case 5: 1137 for (i = m - 1; i >= 0; i--) { 1138 v = aa + 25 * ai[i]; 1139 vi = aj + ai[i]; 1140 nz = ai[i + 1] - ai[i]; 1141 s[0] = b[i2]; 1142 s[1] = b[i2 + 1]; 1143 s[2] = b[i2 + 2]; 1144 s[3] = b[i2 + 3]; 1145 s[4] = b[i2 + 4]; 1146 while (nz--) { 1147 idx = 5 * (*vi++); 1148 xw[0] = x[idx]; 1149 xw[1] = x[1 + idx]; 1150 xw[2] = x[2 + idx]; 1151 xw[3] = x[3 + idx]; 1152 xw[4] = x[4 + idx]; 1153 PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw); 1154 v += 25; 1155 } 1156 PetscKernel_v_gets_A_times_w_5(xw, idiag, s); 1157 x[i2] += xw[0]; 1158 x[i2 + 1] += xw[1]; 1159 x[i2 + 2] += xw[2]; 1160 x[i2 + 3] += xw[3]; 1161 x[i2 + 4] += xw[4]; 1162 idiag -= 25; 1163 i2 -= 5; 1164 } 1165 break; 1166 case 6: 1167 for (i = m - 1; i >= 0; i--) { 1168 v = aa + 36 * ai[i]; 1169 vi = aj + ai[i]; 1170 nz = ai[i + 1] - ai[i]; 1171 s[0] = b[i2]; 1172 s[1] = b[i2 + 1]; 1173 s[2] = b[i2 + 2]; 1174 s[3] = b[i2 + 3]; 1175 s[4] = b[i2 + 4]; 1176 s[5] = b[i2 + 5]; 1177 while (nz--) { 1178 idx = 6 * (*vi++); 1179 xw[0] = x[idx]; 1180 xw[1] = x[1 + idx]; 1181 xw[2] = x[2 + idx]; 1182 xw[3] = x[3 + idx]; 1183 xw[4] = x[4 + idx]; 1184 xw[5] = x[5 + idx]; 1185 PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw); 1186 v += 36; 1187 } 1188 PetscKernel_v_gets_A_times_w_6(xw, idiag, s); 1189 x[i2] += xw[0]; 1190 x[i2 + 1] += xw[1]; 1191 x[i2 + 2] += xw[2]; 1192 x[i2 + 3] += xw[3]; 1193 x[i2 + 4] += xw[4]; 1194 x[i2 + 5] += xw[5]; 1195 idiag -= 36; 1196 i2 -= 6; 1197 } 1198 break; 1199 case 7: 1200 for (i = m - 1; i >= 0; i--) { 1201 v = aa + 49 * ai[i]; 1202 vi = aj + ai[i]; 1203 nz = ai[i + 1] - ai[i]; 1204 s[0] = b[i2]; 1205 s[1] = b[i2 + 1]; 1206 s[2] = b[i2 + 2]; 1207 s[3] = b[i2 + 3]; 1208 s[4] = b[i2 + 4]; 1209 s[5] = b[i2 + 5]; 1210 s[6] = b[i2 + 6]; 1211 while (nz--) { 1212 idx = 7 * (*vi++); 1213 xw[0] = x[idx]; 1214 xw[1] = x[1 + idx]; 1215 xw[2] = x[2 + idx]; 1216 xw[3] = x[3 + idx]; 1217 xw[4] = x[4 + idx]; 1218 xw[5] = x[5 + idx]; 1219 xw[6] = x[6 + idx]; 1220 PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw); 1221 v += 49; 1222 } 1223 PetscKernel_v_gets_A_times_w_7(xw, idiag, s); 1224 x[i2] += xw[0]; 1225 x[i2 + 1] += xw[1]; 1226 x[i2 + 2] += xw[2]; 1227 x[i2 + 3] += xw[3]; 1228 x[i2 + 4] += xw[4]; 1229 x[i2 + 5] += xw[5]; 1230 x[i2 + 6] += xw[6]; 1231 idiag -= 49; 1232 i2 -= 7; 1233 } 1234 break; 1235 default: 1236 for (i = m - 1; i >= 0; i--) { 1237 v = aa + bs2 * ai[i]; 1238 vi = aj + ai[i]; 1239 nz = ai[i + 1] - ai[i]; 1240 1241 PetscCall(PetscArraycpy(w, b + i2, bs)); 1242 /* copy all rows of x that are needed into contiguous space */ 1243 workt = work; 1244 for (j = 0; j < nz; j++) { 1245 PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs)); 1246 workt += bs; 1247 } 1248 PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work); 1249 PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2); 1250 1251 idiag -= bs2; 1252 i2 -= bs; 1253 } 1254 break; 1255 } 1256 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz))); 1257 } 1258 } 1259 PetscCall(VecRestoreArray(xx, &x)); 1260 PetscCall(VecRestoreArrayRead(bb, &b)); 1261 PetscFunctionReturn(0); 1262 } 1263 1264 /* 1265 Special version for direct calls from Fortran (Used in PETSc-fun3d) 1266 */ 1267 #if defined(PETSC_HAVE_FORTRAN_CAPS) 1268 #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4 1269 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE) 1270 #define matsetvaluesblocked4_ matsetvaluesblocked4 1271 #endif 1272 1273 PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[]) { 1274 Mat A = *AA; 1275 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1276 PetscInt *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn; 1277 PetscInt *ai = a->i, *ailen = a->ilen; 1278 PetscInt *aj = a->j, stepval, lastcol = -1; 1279 const PetscScalar *value = v; 1280 MatScalar *ap, *aa = a->a, *bap; 1281 1282 PetscFunctionBegin; 1283 if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4"); 1284 stepval = (n - 1) * 4; 1285 for (k = 0; k < m; k++) { /* loop over added rows */ 1286 row = im[k]; 1287 rp = aj + ai[row]; 1288 ap = aa + 16 * ai[row]; 1289 nrow = ailen[row]; 1290 low = 0; 1291 high = nrow; 1292 for (l = 0; l < n; l++) { /* loop over added columns */ 1293 col = in[l]; 1294 if (col <= lastcol) low = 0; 1295 else high = nrow; 1296 lastcol = col; 1297 value = v + k * (stepval + 4 + l) * 4; 1298 while (high - low > 7) { 1299 t = (low + high) / 2; 1300 if (rp[t] > col) high = t; 1301 else low = t; 1302 } 1303 for (i = low; i < high; i++) { 1304 if (rp[i] > col) break; 1305 if (rp[i] == col) { 1306 bap = ap + 16 * i; 1307 for (ii = 0; ii < 4; ii++, value += stepval) { 1308 for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++; 1309 } 1310 goto noinsert2; 1311 } 1312 } 1313 N = nrow++ - 1; 1314 high++; /* added new column index thus must search to one higher than before */ 1315 /* shift up all the later entries in this row */ 1316 for (ii = N; ii >= i; ii--) { 1317 rp[ii + 1] = rp[ii]; 1318 PetscCallVoid(PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16)); 1319 } 1320 if (N >= i) PetscCallVoid(PetscArrayzero(ap + 16 * i, 16)); 1321 rp[i] = col; 1322 bap = ap + 16 * i; 1323 for (ii = 0; ii < 4; ii++, value += stepval) { 1324 for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++; 1325 } 1326 noinsert2:; 1327 low = i; 1328 } 1329 ailen[row] = nrow; 1330 } 1331 PetscFunctionReturnVoid(); 1332 } 1333 1334 #if defined(PETSC_HAVE_FORTRAN_CAPS) 1335 #define matsetvalues4_ MATSETVALUES4 1336 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE) 1337 #define matsetvalues4_ matsetvalues4 1338 #endif 1339 1340 PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v) { 1341 Mat A = *AA; 1342 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1343 PetscInt *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm; 1344 PetscInt *ai = a->i, *ailen = a->ilen; 1345 PetscInt *aj = a->j, brow, bcol; 1346 PetscInt ridx, cidx, lastcol = -1; 1347 MatScalar *ap, value, *aa = a->a, *bap; 1348 1349 PetscFunctionBegin; 1350 for (k = 0; k < m; k++) { /* loop over added rows */ 1351 row = im[k]; 1352 brow = row / 4; 1353 rp = aj + ai[brow]; 1354 ap = aa + 16 * ai[brow]; 1355 nrow = ailen[brow]; 1356 low = 0; 1357 high = nrow; 1358 for (l = 0; l < n; l++) { /* loop over added columns */ 1359 col = in[l]; 1360 bcol = col / 4; 1361 ridx = row % 4; 1362 cidx = col % 4; 1363 value = v[l + k * n]; 1364 if (col <= lastcol) low = 0; 1365 else high = nrow; 1366 lastcol = col; 1367 while (high - low > 7) { 1368 t = (low + high) / 2; 1369 if (rp[t] > bcol) high = t; 1370 else low = t; 1371 } 1372 for (i = low; i < high; i++) { 1373 if (rp[i] > bcol) break; 1374 if (rp[i] == bcol) { 1375 bap = ap + 16 * i + 4 * cidx + ridx; 1376 *bap += value; 1377 goto noinsert1; 1378 } 1379 } 1380 N = nrow++ - 1; 1381 high++; /* added new column thus must search to one higher than before */ 1382 /* shift up all the later entries in this row */ 1383 PetscCallVoid(PetscArraymove(rp + i + 1, rp + i, N - i + 1)); 1384 PetscCallVoid(PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1))); 1385 PetscCallVoid(PetscArrayzero(ap + 16 * i, 16)); 1386 rp[i] = bcol; 1387 ap[16 * i + 4 * cidx + ridx] = value; 1388 noinsert1:; 1389 low = i; 1390 } 1391 ailen[brow] = nrow; 1392 } 1393 PetscFunctionReturnVoid(); 1394 } 1395 1396 /* 1397 Checks for missing diagonals 1398 */ 1399 PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A, PetscBool *missing, PetscInt *d) { 1400 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1401 PetscInt *diag, *ii = a->i, i; 1402 1403 PetscFunctionBegin; 1404 PetscCall(MatMarkDiagonal_SeqBAIJ(A)); 1405 *missing = PETSC_FALSE; 1406 if (A->rmap->n > 0 && !ii) { 1407 *missing = PETSC_TRUE; 1408 if (d) *d = 0; 1409 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 1410 } else { 1411 PetscInt n; 1412 n = PetscMin(a->mbs, a->nbs); 1413 diag = a->diag; 1414 for (i = 0; i < n; i++) { 1415 if (diag[i] >= ii[i + 1]) { 1416 *missing = PETSC_TRUE; 1417 if (d) *d = i; 1418 PetscCall(PetscInfo(A, "Matrix is missing block diagonal number %" PetscInt_FMT "\n", i)); 1419 break; 1420 } 1421 } 1422 } 1423 PetscFunctionReturn(0); 1424 } 1425 1426 PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A) { 1427 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1428 PetscInt i, j, m = a->mbs; 1429 1430 PetscFunctionBegin; 1431 if (!a->diag) { 1432 PetscCall(PetscMalloc1(m, &a->diag)); 1433 a->free_diag = PETSC_TRUE; 1434 } 1435 for (i = 0; i < m; i++) { 1436 a->diag[i] = a->i[i + 1]; 1437 for (j = a->i[i]; j < a->i[i + 1]; j++) { 1438 if (a->j[j] == i) { 1439 a->diag[i] = j; 1440 break; 1441 } 1442 } 1443 } 1444 PetscFunctionReturn(0); 1445 } 1446 1447 static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done) { 1448 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1449 PetscInt i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt; 1450 PetscInt **ia = (PetscInt **)inia, **ja = (PetscInt **)inja; 1451 1452 PetscFunctionBegin; 1453 *nn = n; 1454 if (!ia) PetscFunctionReturn(0); 1455 if (symmetric) { 1456 PetscCall(MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja)); 1457 nz = tia[n]; 1458 } else { 1459 tia = a->i; 1460 tja = a->j; 1461 } 1462 1463 if (!blockcompressed && bs > 1) { 1464 (*nn) *= bs; 1465 /* malloc & create the natural set of indices */ 1466 PetscCall(PetscMalloc1((n + 1) * bs, ia)); 1467 if (n) { 1468 (*ia)[0] = oshift; 1469 for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1]; 1470 } 1471 1472 for (i = 1; i < n; i++) { 1473 (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1]; 1474 for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1]; 1475 } 1476 if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1]; 1477 1478 if (inja) { 1479 PetscCall(PetscMalloc1(nz * bs * bs, ja)); 1480 cnt = 0; 1481 for (i = 0; i < n; i++) { 1482 for (j = 0; j < bs; j++) { 1483 for (k = tia[i]; k < tia[i + 1]; k++) { 1484 for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l; 1485 } 1486 } 1487 } 1488 } 1489 1490 if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */ 1491 PetscCall(PetscFree(tia)); 1492 PetscCall(PetscFree(tja)); 1493 } 1494 } else if (oshift == 1) { 1495 if (symmetric) { 1496 nz = tia[A->rmap->n / bs]; 1497 /* add 1 to i and j indices */ 1498 for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1; 1499 *ia = tia; 1500 if (ja) { 1501 for (i = 0; i < nz; i++) tja[i] = tja[i] + 1; 1502 *ja = tja; 1503 } 1504 } else { 1505 nz = a->i[A->rmap->n / bs]; 1506 /* malloc space and add 1 to i and j indices */ 1507 PetscCall(PetscMalloc1(A->rmap->n / bs + 1, ia)); 1508 for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1; 1509 if (ja) { 1510 PetscCall(PetscMalloc1(nz, ja)); 1511 for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1; 1512 } 1513 } 1514 } else { 1515 *ia = tia; 1516 if (ja) *ja = tja; 1517 } 1518 PetscFunctionReturn(0); 1519 } 1520 1521 static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 1522 PetscFunctionBegin; 1523 if (!ia) PetscFunctionReturn(0); 1524 if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) { 1525 PetscCall(PetscFree(*ia)); 1526 if (ja) PetscCall(PetscFree(*ja)); 1527 } 1528 PetscFunctionReturn(0); 1529 } 1530 1531 PetscErrorCode MatDestroy_SeqBAIJ(Mat A) { 1532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1533 1534 PetscFunctionBegin; 1535 #if defined(PETSC_USE_LOG) 1536 PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz); 1537 #endif 1538 PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i)); 1539 PetscCall(ISDestroy(&a->row)); 1540 PetscCall(ISDestroy(&a->col)); 1541 if (a->free_diag) PetscCall(PetscFree(a->diag)); 1542 PetscCall(PetscFree(a->idiag)); 1543 if (a->free_imax_ilen) PetscCall(PetscFree2(a->imax, a->ilen)); 1544 PetscCall(PetscFree(a->solve_work)); 1545 PetscCall(PetscFree(a->mult_work)); 1546 PetscCall(PetscFree(a->sor_workt)); 1547 PetscCall(PetscFree(a->sor_work)); 1548 PetscCall(ISDestroy(&a->icol)); 1549 PetscCall(PetscFree(a->saved_values)); 1550 PetscCall(PetscFree2(a->compressedrow.i, a->compressedrow.rindex)); 1551 1552 PetscCall(MatDestroy(&a->sbaijMat)); 1553 PetscCall(MatDestroy(&a->parent)); 1554 PetscCall(PetscFree(A->data)); 1555 1556 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 1557 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL)); 1558 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL)); 1559 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 1560 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 1561 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL)); 1562 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL)); 1563 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL)); 1564 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL)); 1565 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL)); 1566 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL)); 1567 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL)); 1568 #if defined(PETSC_HAVE_HYPRE) 1569 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL)); 1570 #endif 1571 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL)); 1572 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 1573 PetscFunctionReturn(0); 1574 } 1575 1576 PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg) { 1577 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1578 1579 PetscFunctionBegin; 1580 switch (op) { 1581 case MAT_ROW_ORIENTED: a->roworiented = flg; break; 1582 case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break; 1583 case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break; 1584 case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break; 1585 case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break; 1586 case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break; 1587 case MAT_FORCE_DIAGONAL_ENTRIES: 1588 case MAT_IGNORE_OFF_PROC_ENTRIES: 1589 case MAT_USE_HASH_TABLE: 1590 case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break; 1591 case MAT_SPD: 1592 case MAT_SYMMETRIC: 1593 case MAT_STRUCTURALLY_SYMMETRIC: 1594 case MAT_HERMITIAN: 1595 case MAT_SYMMETRY_ETERNAL: 1596 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 1597 case MAT_SUBMAT_SINGLEIS: 1598 case MAT_STRUCTURE_ONLY: 1599 case MAT_SPD_ETERNAL: 1600 /* if the diagonal matrix is square it inherits some of the properties above */ 1601 break; 1602 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 1603 } 1604 PetscFunctionReturn(0); 1605 } 1606 1607 /* used for both SeqBAIJ and SeqSBAIJ matrices */ 1608 PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa) { 1609 PetscInt itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2; 1610 MatScalar *aa_i; 1611 PetscScalar *v_i; 1612 1613 PetscFunctionBegin; 1614 bs = A->rmap->bs; 1615 bs2 = bs * bs; 1616 PetscCheck(row >= 0 && row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 1617 1618 bn = row / bs; /* Block number */ 1619 bp = row % bs; /* Block Position */ 1620 M = ai[bn + 1] - ai[bn]; 1621 *nz = bs * M; 1622 1623 if (v) { 1624 *v = NULL; 1625 if (*nz) { 1626 PetscCall(PetscMalloc1(*nz, v)); 1627 for (i = 0; i < M; i++) { /* for each block in the block row */ 1628 v_i = *v + i * bs; 1629 aa_i = aa + bs2 * (ai[bn] + i); 1630 for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j]; 1631 } 1632 } 1633 } 1634 1635 if (idx) { 1636 *idx = NULL; 1637 if (*nz) { 1638 PetscCall(PetscMalloc1(*nz, idx)); 1639 for (i = 0; i < M; i++) { /* for each block in the block row */ 1640 idx_i = *idx + i * bs; 1641 itmp = bs * aj[ai[bn] + i]; 1642 for (j = 0; j < bs; j++) idx_i[j] = itmp++; 1643 } 1644 } 1645 } 1646 PetscFunctionReturn(0); 1647 } 1648 1649 PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) { 1650 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1651 1652 PetscFunctionBegin; 1653 PetscCall(MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a)); 1654 PetscFunctionReturn(0); 1655 } 1656 1657 PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) { 1658 PetscFunctionBegin; 1659 if (nz) *nz = 0; 1660 if (idx) PetscCall(PetscFree(*idx)); 1661 if (v) PetscCall(PetscFree(*v)); 1662 PetscFunctionReturn(0); 1663 } 1664 1665 PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B) { 1666 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at; 1667 Mat C; 1668 PetscInt i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill; 1669 PetscInt bs2 = a->bs2, *ati, *atj, anzj, kr; 1670 MatScalar *ata, *aa = a->a; 1671 1672 PetscFunctionBegin; 1673 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 1674 PetscCall(PetscCalloc1(1 + nbs, &atfill)); 1675 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 1676 for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */ 1677 1678 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1679 PetscCall(MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N)); 1680 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1681 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, atfill)); 1682 1683 at = (Mat_SeqBAIJ *)C->data; 1684 ati = at->i; 1685 for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i]; 1686 } else { 1687 C = *B; 1688 at = (Mat_SeqBAIJ *)C->data; 1689 ati = at->i; 1690 } 1691 1692 atj = at->j; 1693 ata = at->a; 1694 1695 /* Copy ati into atfill so we have locations of the next free space in atj */ 1696 PetscCall(PetscArraycpy(atfill, ati, nbs)); 1697 1698 /* Walk through A row-wise and mark nonzero entries of A^T. */ 1699 for (i = 0; i < mbs; i++) { 1700 anzj = ai[i + 1] - ai[i]; 1701 for (j = 0; j < anzj; j++) { 1702 atj[atfill[*aj]] = i; 1703 for (kr = 0; kr < bs; kr++) { 1704 for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++; 1705 } 1706 atfill[*aj++] += 1; 1707 } 1708 } 1709 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1710 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1711 1712 /* Clean up temporary space and complete requests. */ 1713 PetscCall(PetscFree(atfill)); 1714 1715 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) { 1716 PetscCall(MatSetBlockSizes(C, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs))); 1717 *B = C; 1718 } else { 1719 PetscCall(MatHeaderMerge(A, &C)); 1720 } 1721 PetscFunctionReturn(0); 1722 } 1723 1724 PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f) { 1725 Mat Btrans; 1726 1727 PetscFunctionBegin; 1728 *f = PETSC_FALSE; 1729 PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans)); 1730 PetscCall(MatEqual_SeqBAIJ(B, Btrans, f)); 1731 PetscCall(MatDestroy(&Btrans)); 1732 PetscFunctionReturn(0); 1733 } 1734 1735 /* Used for both SeqBAIJ and SeqSBAIJ matrices */ 1736 PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer) { 1737 Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data; 1738 PetscInt header[4], M, N, m, bs, nz, cnt, i, j, k, l; 1739 PetscInt *rowlens, *colidxs; 1740 PetscScalar *matvals; 1741 1742 PetscFunctionBegin; 1743 PetscCall(PetscViewerSetUp(viewer)); 1744 1745 M = mat->rmap->N; 1746 N = mat->cmap->N; 1747 m = mat->rmap->n; 1748 bs = mat->rmap->bs; 1749 nz = bs * bs * A->nz; 1750 1751 /* write matrix header */ 1752 header[0] = MAT_FILE_CLASSID; 1753 header[1] = M; 1754 header[2] = N; 1755 header[3] = nz; 1756 PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT)); 1757 1758 /* store row lengths */ 1759 PetscCall(PetscMalloc1(m, &rowlens)); 1760 for (cnt = 0, i = 0; i < A->mbs; i++) 1761 for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]); 1762 PetscCall(PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT)); 1763 PetscCall(PetscFree(rowlens)); 1764 1765 /* store column indices */ 1766 PetscCall(PetscMalloc1(nz, &colidxs)); 1767 for (cnt = 0, i = 0; i < A->mbs; i++) 1768 for (k = 0; k < bs; k++) 1769 for (j = A->i[i]; j < A->i[i + 1]; j++) 1770 for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l; 1771 PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz); 1772 PetscCall(PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT)); 1773 PetscCall(PetscFree(colidxs)); 1774 1775 /* store nonzero values */ 1776 PetscCall(PetscMalloc1(nz, &matvals)); 1777 for (cnt = 0, i = 0; i < A->mbs; i++) 1778 for (k = 0; k < bs; k++) 1779 for (j = A->i[i]; j < A->i[i + 1]; j++) 1780 for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k]; 1781 PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz); 1782 PetscCall(PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR)); 1783 PetscCall(PetscFree(matvals)); 1784 1785 /* write block size option to the viewer's .info file */ 1786 PetscCall(MatView_Binary_BlockSizes(mat, viewer)); 1787 PetscFunctionReturn(0); 1788 } 1789 1790 static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer) { 1791 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1792 PetscInt i, bs = A->rmap->bs, k; 1793 1794 PetscFunctionBegin; 1795 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1796 for (i = 0; i < a->mbs; i++) { 1797 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1)); 1798 for (k = a->i[i]; k < a->i[i + 1]; k++) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1)); 1799 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1800 } 1801 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1802 PetscFunctionReturn(0); 1803 } 1804 1805 static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer) { 1806 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1807 PetscInt i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2; 1808 PetscViewerFormat format; 1809 1810 PetscFunctionBegin; 1811 if (A->structure_only) { 1812 PetscCall(MatView_SeqBAIJ_ASCII_structonly(A, viewer)); 1813 PetscFunctionReturn(0); 1814 } 1815 1816 PetscCall(PetscViewerGetFormat(viewer, &format)); 1817 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 1818 PetscCall(PetscViewerASCIIPrintf(viewer, " block size is %" PetscInt_FMT "\n", bs)); 1819 } else if (format == PETSC_VIEWER_ASCII_MATLAB) { 1820 const char *matname; 1821 Mat aij; 1822 PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij)); 1823 PetscCall(PetscObjectGetName((PetscObject)A, &matname)); 1824 PetscCall(PetscObjectSetName((PetscObject)aij, matname)); 1825 PetscCall(MatView(aij, viewer)); 1826 PetscCall(MatDestroy(&aij)); 1827 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) { 1828 PetscFunctionReturn(0); 1829 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1830 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1831 for (i = 0; i < a->mbs; i++) { 1832 for (j = 0; j < bs; j++) { 1833 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j)); 1834 for (k = a->i[i]; k < a->i[i + 1]; k++) { 1835 for (l = 0; l < bs; l++) { 1836 #if defined(PETSC_USE_COMPLEX) 1837 if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) { 1838 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]))); 1839 } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) { 1840 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]))); 1841 } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) { 1842 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]))); 1843 } 1844 #else 1845 if (a->a[bs2 * k + l * bs + j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j])); 1846 #endif 1847 } 1848 } 1849 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1850 } 1851 } 1852 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1853 } else { 1854 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1855 for (i = 0; i < a->mbs; i++) { 1856 for (j = 0; j < bs; j++) { 1857 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j)); 1858 for (k = a->i[i]; k < a->i[i + 1]; k++) { 1859 for (l = 0; l < bs; l++) { 1860 #if defined(PETSC_USE_COMPLEX) 1861 if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) { 1862 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]))); 1863 } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) { 1864 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]))); 1865 } else { 1866 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]))); 1867 } 1868 #else 1869 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j])); 1870 #endif 1871 } 1872 } 1873 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1874 } 1875 } 1876 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1877 } 1878 PetscCall(PetscViewerFlush(viewer)); 1879 PetscFunctionReturn(0); 1880 } 1881 1882 #include <petscdraw.h> 1883 static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa) { 1884 Mat A = (Mat)Aa; 1885 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1886 PetscInt row, i, j, k, l, mbs = a->mbs, color, bs = A->rmap->bs, bs2 = a->bs2; 1887 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1888 MatScalar *aa; 1889 PetscViewer viewer; 1890 PetscViewerFormat format; 1891 1892 PetscFunctionBegin; 1893 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1894 PetscCall(PetscViewerGetFormat(viewer, &format)); 1895 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1896 1897 /* loop over matrix elements drawing boxes */ 1898 1899 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1900 PetscDrawCollectiveBegin(draw); 1901 /* Blue for negative, Cyan for zero and Red for positive */ 1902 color = PETSC_DRAW_BLUE; 1903 for (i = 0, row = 0; i < mbs; i++, row += bs) { 1904 for (j = a->i[i]; j < a->i[i + 1]; j++) { 1905 y_l = A->rmap->N - row - 1.0; 1906 y_r = y_l + 1.0; 1907 x_l = a->j[j] * bs; 1908 x_r = x_l + 1.0; 1909 aa = a->a + j * bs2; 1910 for (k = 0; k < bs; k++) { 1911 for (l = 0; l < bs; l++) { 1912 if (PetscRealPart(*aa++) >= 0.) continue; 1913 PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color)); 1914 } 1915 } 1916 } 1917 } 1918 color = PETSC_DRAW_CYAN; 1919 for (i = 0, row = 0; i < mbs; i++, row += bs) { 1920 for (j = a->i[i]; j < a->i[i + 1]; j++) { 1921 y_l = A->rmap->N - row - 1.0; 1922 y_r = y_l + 1.0; 1923 x_l = a->j[j] * bs; 1924 x_r = x_l + 1.0; 1925 aa = a->a + j * bs2; 1926 for (k = 0; k < bs; k++) { 1927 for (l = 0; l < bs; l++) { 1928 if (PetscRealPart(*aa++) != 0.) continue; 1929 PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color)); 1930 } 1931 } 1932 } 1933 } 1934 color = PETSC_DRAW_RED; 1935 for (i = 0, row = 0; i < mbs; i++, row += bs) { 1936 for (j = a->i[i]; j < a->i[i + 1]; j++) { 1937 y_l = A->rmap->N - row - 1.0; 1938 y_r = y_l + 1.0; 1939 x_l = a->j[j] * bs; 1940 x_r = x_l + 1.0; 1941 aa = a->a + j * bs2; 1942 for (k = 0; k < bs; k++) { 1943 for (l = 0; l < bs; l++) { 1944 if (PetscRealPart(*aa++) <= 0.) continue; 1945 PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color)); 1946 } 1947 } 1948 } 1949 } 1950 PetscDrawCollectiveEnd(draw); 1951 } else { 1952 /* use contour shading to indicate magnitude of values */ 1953 /* first determine max of all nonzero values */ 1954 PetscReal minv = 0.0, maxv = 0.0; 1955 PetscDraw popup; 1956 1957 for (i = 0; i < a->nz * a->bs2; i++) { 1958 if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]); 1959 } 1960 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1961 PetscCall(PetscDrawGetPopup(draw, &popup)); 1962 PetscCall(PetscDrawScalePopup(popup, 0.0, maxv)); 1963 1964 PetscDrawCollectiveBegin(draw); 1965 for (i = 0, row = 0; i < mbs; i++, row += bs) { 1966 for (j = a->i[i]; j < a->i[i + 1]; j++) { 1967 y_l = A->rmap->N - row - 1.0; 1968 y_r = y_l + 1.0; 1969 x_l = a->j[j] * bs; 1970 x_r = x_l + 1.0; 1971 aa = a->a + j * bs2; 1972 for (k = 0; k < bs; k++) { 1973 for (l = 0; l < bs; l++) { 1974 MatScalar v = *aa++; 1975 color = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv); 1976 PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color)); 1977 } 1978 } 1979 } 1980 } 1981 PetscDrawCollectiveEnd(draw); 1982 } 1983 PetscFunctionReturn(0); 1984 } 1985 1986 static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer) { 1987 PetscReal xl, yl, xr, yr, w, h; 1988 PetscDraw draw; 1989 PetscBool isnull; 1990 1991 PetscFunctionBegin; 1992 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1993 PetscCall(PetscDrawIsNull(draw, &isnull)); 1994 if (isnull) PetscFunctionReturn(0); 1995 1996 xr = A->cmap->n; 1997 yr = A->rmap->N; 1998 h = yr / 10.0; 1999 w = xr / 10.0; 2000 xr += w; 2001 yr += h; 2002 xl = -w; 2003 yl = -h; 2004 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 2005 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 2006 PetscCall(PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A)); 2007 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 2008 PetscCall(PetscDrawSave(draw)); 2009 PetscFunctionReturn(0); 2010 } 2011 2012 PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer) { 2013 PetscBool iascii, isbinary, isdraw; 2014 2015 PetscFunctionBegin; 2016 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 2017 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 2018 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 2019 if (iascii) { 2020 PetscCall(MatView_SeqBAIJ_ASCII(A, viewer)); 2021 } else if (isbinary) { 2022 PetscCall(MatView_SeqBAIJ_Binary(A, viewer)); 2023 } else if (isdraw) { 2024 PetscCall(MatView_SeqBAIJ_Draw(A, viewer)); 2025 } else { 2026 Mat B; 2027 PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B)); 2028 PetscCall(MatView(B, viewer)); 2029 PetscCall(MatDestroy(&B)); 2030 } 2031 PetscFunctionReturn(0); 2032 } 2033 2034 PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) { 2035 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2036 PetscInt *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j; 2037 PetscInt *ai = a->i, *ailen = a->ilen; 2038 PetscInt brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2; 2039 MatScalar *ap, *aa = a->a; 2040 2041 PetscFunctionBegin; 2042 for (k = 0; k < m; k++) { /* loop over rows */ 2043 row = im[k]; 2044 brow = row / bs; 2045 if (row < 0) { 2046 v += n; 2047 continue; 2048 } /* negative row */ 2049 PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " too large", row); 2050 rp = aj ? aj + ai[brow] : NULL; /* mustn't add to NULL, that is UB */ 2051 ap = aa ? aa + bs2 * ai[brow] : NULL; /* mustn't add to NULL, that is UB */ 2052 nrow = ailen[brow]; 2053 for (l = 0; l < n; l++) { /* loop over columns */ 2054 if (in[l] < 0) { 2055 v++; 2056 continue; 2057 } /* negative column */ 2058 PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column %" PetscInt_FMT " too large", in[l]); 2059 col = in[l]; 2060 bcol = col / bs; 2061 cidx = col % bs; 2062 ridx = row % bs; 2063 high = nrow; 2064 low = 0; /* assume unsorted */ 2065 while (high - low > 5) { 2066 t = (low + high) / 2; 2067 if (rp[t] > bcol) high = t; 2068 else low = t; 2069 } 2070 for (i = low; i < high; i++) { 2071 if (rp[i] > bcol) break; 2072 if (rp[i] == bcol) { 2073 *v++ = ap[bs2 * i + bs * cidx + ridx]; 2074 goto finished; 2075 } 2076 } 2077 *v++ = 0.0; 2078 finished:; 2079 } 2080 } 2081 PetscFunctionReturn(0); 2082 } 2083 2084 PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) { 2085 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2086 PetscInt *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1; 2087 PetscInt *imax = a->imax, *ai = a->i, *ailen = a->ilen; 2088 PetscInt *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval; 2089 PetscBool roworiented = a->roworiented; 2090 const PetscScalar *value = v; 2091 MatScalar *ap = NULL, *aa = a->a, *bap; 2092 2093 PetscFunctionBegin; 2094 if (roworiented) { 2095 stepval = (n - 1) * bs; 2096 } else { 2097 stepval = (m - 1) * bs; 2098 } 2099 for (k = 0; k < m; k++) { /* loop over added rows */ 2100 row = im[k]; 2101 if (row < 0) continue; 2102 PetscCheck(row < a->mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block row index too large %" PetscInt_FMT " max %" PetscInt_FMT, row, a->mbs - 1); 2103 rp = aj + ai[row]; 2104 if (!A->structure_only) ap = aa + bs2 * ai[row]; 2105 rmax = imax[row]; 2106 nrow = ailen[row]; 2107 low = 0; 2108 high = nrow; 2109 for (l = 0; l < n; l++) { /* loop over added columns */ 2110 if (in[l] < 0) continue; 2111 PetscCheck(in[l] < a->nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block column index too large %" PetscInt_FMT " max %" PetscInt_FMT, in[l], a->nbs - 1); 2112 col = in[l]; 2113 if (!A->structure_only) { 2114 if (roworiented) { 2115 value = v + (k * (stepval + bs) + l) * bs; 2116 } else { 2117 value = v + (l * (stepval + bs) + k) * bs; 2118 } 2119 } 2120 if (col <= lastcol) low = 0; 2121 else high = nrow; 2122 lastcol = col; 2123 while (high - low > 7) { 2124 t = (low + high) / 2; 2125 if (rp[t] > col) high = t; 2126 else low = t; 2127 } 2128 for (i = low; i < high; i++) { 2129 if (rp[i] > col) break; 2130 if (rp[i] == col) { 2131 if (A->structure_only) goto noinsert2; 2132 bap = ap + bs2 * i; 2133 if (roworiented) { 2134 if (is == ADD_VALUES) { 2135 for (ii = 0; ii < bs; ii++, value += stepval) { 2136 for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++; 2137 } 2138 } else { 2139 for (ii = 0; ii < bs; ii++, value += stepval) { 2140 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++; 2141 } 2142 } 2143 } else { 2144 if (is == ADD_VALUES) { 2145 for (ii = 0; ii < bs; ii++, value += bs + stepval) { 2146 for (jj = 0; jj < bs; jj++) bap[jj] += value[jj]; 2147 bap += bs; 2148 } 2149 } else { 2150 for (ii = 0; ii < bs; ii++, value += bs + stepval) { 2151 for (jj = 0; jj < bs; jj++) bap[jj] = value[jj]; 2152 bap += bs; 2153 } 2154 } 2155 } 2156 goto noinsert2; 2157 } 2158 } 2159 if (nonew == 1) goto noinsert2; 2160 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new blocked index new nonzero block (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 2161 if (A->structure_only) { 2162 MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar); 2163 } else { 2164 MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar); 2165 } 2166 N = nrow++ - 1; 2167 high++; 2168 /* shift up all the later entries in this row */ 2169 PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1)); 2170 rp[i] = col; 2171 if (!A->structure_only) { 2172 PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1))); 2173 bap = ap + bs2 * i; 2174 if (roworiented) { 2175 for (ii = 0; ii < bs; ii++, value += stepval) { 2176 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++; 2177 } 2178 } else { 2179 for (ii = 0; ii < bs; ii++, value += stepval) { 2180 for (jj = 0; jj < bs; jj++) *bap++ = *value++; 2181 } 2182 } 2183 } 2184 noinsert2:; 2185 low = i; 2186 } 2187 ailen[row] = nrow; 2188 } 2189 PetscFunctionReturn(0); 2190 } 2191 2192 PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode) { 2193 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2194 PetscInt fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax; 2195 PetscInt m = A->rmap->N, *ip, N, *ailen = a->ilen; 2196 PetscInt mbs = a->mbs, bs2 = a->bs2, rmax = 0; 2197 MatScalar *aa = a->a, *ap; 2198 PetscReal ratio = 0.6; 2199 2200 PetscFunctionBegin; 2201 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 2202 2203 if (m) rmax = ailen[0]; 2204 for (i = 1; i < mbs; i++) { 2205 /* move each row back by the amount of empty slots (fshift) before it*/ 2206 fshift += imax[i - 1] - ailen[i - 1]; 2207 rmax = PetscMax(rmax, ailen[i]); 2208 if (fshift) { 2209 ip = aj + ai[i]; 2210 ap = aa + bs2 * ai[i]; 2211 N = ailen[i]; 2212 PetscCall(PetscArraymove(ip - fshift, ip, N)); 2213 if (!A->structure_only) PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2 * N)); 2214 } 2215 ai[i] = ai[i - 1] + ailen[i - 1]; 2216 } 2217 if (mbs) { 2218 fshift += imax[mbs - 1] - ailen[mbs - 1]; 2219 ai[mbs] = ai[mbs - 1] + ailen[mbs - 1]; 2220 } 2221 2222 /* reset ilen and imax for each row */ 2223 a->nonzerorowcnt = 0; 2224 if (A->structure_only) { 2225 PetscCall(PetscFree2(a->imax, a->ilen)); 2226 } else { /* !A->structure_only */ 2227 for (i = 0; i < mbs; i++) { 2228 ailen[i] = imax[i] = ai[i + 1] - ai[i]; 2229 a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0); 2230 } 2231 } 2232 a->nz = ai[mbs]; 2233 2234 /* diagonals may have moved, so kill the diagonal pointers */ 2235 a->idiagvalid = PETSC_FALSE; 2236 if (fshift && a->diag) { 2237 PetscCall(PetscFree(a->diag)); 2238 a->diag = NULL; 2239 } 2240 if (fshift) PetscCheck(a->nounused != -1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unused space detected in matrix: %" PetscInt_FMT " X %" PetscInt_FMT " block size %" PetscInt_FMT ", %" PetscInt_FMT " unneeded", m, A->cmap->n, A->rmap->bs, fshift * bs2); 2241 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2)); 2242 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs)); 2243 PetscCall(PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax)); 2244 2245 A->info.mallocs += a->reallocs; 2246 a->reallocs = 0; 2247 A->info.nz_unneeded = (PetscReal)fshift * bs2; 2248 a->rmax = rmax; 2249 2250 if (!A->structure_only) PetscCall(MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio)); 2251 PetscFunctionReturn(0); 2252 } 2253 2254 /* 2255 This function returns an array of flags which indicate the locations of contiguous 2256 blocks that should be zeroed. for eg: if bs = 3 and is = [0,1,2,3,5,6,7,8,9] 2257 then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)] 2258 Assume: sizes should be long enough to hold all the values. 2259 */ 2260 static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max) { 2261 PetscInt i, j, k, row; 2262 PetscBool flg; 2263 2264 PetscFunctionBegin; 2265 for (i = 0, j = 0; i < n; j++) { 2266 row = idx[i]; 2267 if (row % bs != 0) { /* Not the beginning of a block */ 2268 sizes[j] = 1; 2269 i++; 2270 } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */ 2271 sizes[j] = 1; /* Also makes sure at least 'bs' values exist for next else */ 2272 i++; 2273 } else { /* Beginning of the block, so check if the complete block exists */ 2274 flg = PETSC_TRUE; 2275 for (k = 1; k < bs; k++) { 2276 if (row + k != idx[i + k]) { /* break in the block */ 2277 flg = PETSC_FALSE; 2278 break; 2279 } 2280 } 2281 if (flg) { /* No break in the bs */ 2282 sizes[j] = bs; 2283 i += bs; 2284 } else { 2285 sizes[j] = 1; 2286 i++; 2287 } 2288 } 2289 } 2290 *bs_max = j; 2291 PetscFunctionReturn(0); 2292 } 2293 2294 PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b) { 2295 Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)A->data; 2296 PetscInt i, j, k, count, *rows; 2297 PetscInt bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max; 2298 PetscScalar zero = 0.0; 2299 MatScalar *aa; 2300 const PetscScalar *xx; 2301 PetscScalar *bb; 2302 2303 PetscFunctionBegin; 2304 /* fix right hand side if needed */ 2305 if (x && b) { 2306 PetscCall(VecGetArrayRead(x, &xx)); 2307 PetscCall(VecGetArray(b, &bb)); 2308 for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]]; 2309 PetscCall(VecRestoreArrayRead(x, &xx)); 2310 PetscCall(VecRestoreArray(b, &bb)); 2311 } 2312 2313 /* Make a copy of the IS and sort it */ 2314 /* allocate memory for rows,sizes */ 2315 PetscCall(PetscMalloc2(is_n, &rows, 2 * is_n, &sizes)); 2316 2317 /* copy IS values to rows, and sort them */ 2318 for (i = 0; i < is_n; i++) rows[i] = is_idx[i]; 2319 PetscCall(PetscSortInt(is_n, rows)); 2320 2321 if (baij->keepnonzeropattern) { 2322 for (i = 0; i < is_n; i++) sizes[i] = 1; 2323 bs_max = is_n; 2324 } else { 2325 PetscCall(MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max)); 2326 A->nonzerostate++; 2327 } 2328 2329 for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) { 2330 row = rows[j]; 2331 PetscCheck(row >= 0 && row <= A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", row); 2332 count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs; 2333 aa = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs); 2334 if (sizes[i] == bs && !baij->keepnonzeropattern) { 2335 if (diag != (PetscScalar)0.0) { 2336 if (baij->ilen[row / bs] > 0) { 2337 baij->ilen[row / bs] = 1; 2338 baij->j[baij->i[row / bs]] = row / bs; 2339 2340 PetscCall(PetscArrayzero(aa, count * bs)); 2341 } 2342 /* Now insert all the diagonal values for this bs */ 2343 for (k = 0; k < bs; k++) PetscCall((*A->ops->setvalues)(A, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES)); 2344 } else { /* (diag == 0.0) */ 2345 baij->ilen[row / bs] = 0; 2346 } /* end (diag == 0.0) */ 2347 } else { /* (sizes[i] != bs) */ 2348 PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1"); 2349 for (k = 0; k < count; k++) { 2350 aa[0] = zero; 2351 aa += bs; 2352 } 2353 if (diag != (PetscScalar)0.0) PetscCall((*A->ops->setvalues)(A, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES)); 2354 } 2355 } 2356 2357 PetscCall(PetscFree2(rows, sizes)); 2358 PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY)); 2359 PetscFunctionReturn(0); 2360 } 2361 2362 PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b) { 2363 Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)A->data; 2364 PetscInt i, j, k, count; 2365 PetscInt bs = A->rmap->bs, bs2 = baij->bs2, row, col; 2366 PetscScalar zero = 0.0; 2367 MatScalar *aa; 2368 const PetscScalar *xx; 2369 PetscScalar *bb; 2370 PetscBool *zeroed, vecs = PETSC_FALSE; 2371 2372 PetscFunctionBegin; 2373 /* fix right hand side if needed */ 2374 if (x && b) { 2375 PetscCall(VecGetArrayRead(x, &xx)); 2376 PetscCall(VecGetArray(b, &bb)); 2377 vecs = PETSC_TRUE; 2378 } 2379 2380 /* zero the columns */ 2381 PetscCall(PetscCalloc1(A->rmap->n, &zeroed)); 2382 for (i = 0; i < is_n; i++) { 2383 PetscCheck(is_idx[i] >= 0 && is_idx[i] < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", is_idx[i]); 2384 zeroed[is_idx[i]] = PETSC_TRUE; 2385 } 2386 for (i = 0; i < A->rmap->N; i++) { 2387 if (!zeroed[i]) { 2388 row = i / bs; 2389 for (j = baij->i[row]; j < baij->i[row + 1]; j++) { 2390 for (k = 0; k < bs; k++) { 2391 col = bs * baij->j[j] + k; 2392 if (zeroed[col]) { 2393 aa = ((MatScalar *)(baij->a)) + j * bs2 + (i % bs) + bs * k; 2394 if (vecs) bb[i] -= aa[0] * xx[col]; 2395 aa[0] = 0.0; 2396 } 2397 } 2398 } 2399 } else if (vecs) bb[i] = diag * xx[i]; 2400 } 2401 PetscCall(PetscFree(zeroed)); 2402 if (vecs) { 2403 PetscCall(VecRestoreArrayRead(x, &xx)); 2404 PetscCall(VecRestoreArray(b, &bb)); 2405 } 2406 2407 /* zero the rows */ 2408 for (i = 0; i < is_n; i++) { 2409 row = is_idx[i]; 2410 count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs; 2411 aa = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs); 2412 for (k = 0; k < count; k++) { 2413 aa[0] = zero; 2414 aa += bs; 2415 } 2416 if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES); 2417 } 2418 PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY)); 2419 PetscFunctionReturn(0); 2420 } 2421 2422 PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) { 2423 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2424 PetscInt *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1; 2425 PetscInt *imax = a->imax, *ai = a->i, *ailen = a->ilen; 2426 PetscInt *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol; 2427 PetscInt ridx, cidx, bs2 = a->bs2; 2428 PetscBool roworiented = a->roworiented; 2429 MatScalar *ap = NULL, value = 0.0, *aa = a->a, *bap; 2430 2431 PetscFunctionBegin; 2432 for (k = 0; k < m; k++) { /* loop over added rows */ 2433 row = im[k]; 2434 brow = row / bs; 2435 if (row < 0) continue; 2436 PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->N - 1); 2437 rp = aj + ai[brow]; 2438 if (!A->structure_only) ap = aa + bs2 * ai[brow]; 2439 rmax = imax[brow]; 2440 nrow = ailen[brow]; 2441 low = 0; 2442 high = nrow; 2443 for (l = 0; l < n; l++) { /* loop over added columns */ 2444 if (in[l] < 0) continue; 2445 PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[l], A->cmap->n - 1); 2446 col = in[l]; 2447 bcol = col / bs; 2448 ridx = row % bs; 2449 cidx = col % bs; 2450 if (!A->structure_only) { 2451 if (roworiented) { 2452 value = v[l + k * n]; 2453 } else { 2454 value = v[k + l * m]; 2455 } 2456 } 2457 if (col <= lastcol) low = 0; 2458 else high = nrow; 2459 lastcol = col; 2460 while (high - low > 7) { 2461 t = (low + high) / 2; 2462 if (rp[t] > bcol) high = t; 2463 else low = t; 2464 } 2465 for (i = low; i < high; i++) { 2466 if (rp[i] > bcol) break; 2467 if (rp[i] == bcol) { 2468 bap = ap + bs2 * i + bs * cidx + ridx; 2469 if (!A->structure_only) { 2470 if (is == ADD_VALUES) *bap += value; 2471 else *bap = value; 2472 } 2473 goto noinsert1; 2474 } 2475 } 2476 if (nonew == 1) goto noinsert1; 2477 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 2478 if (A->structure_only) { 2479 MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar); 2480 } else { 2481 MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar); 2482 } 2483 N = nrow++ - 1; 2484 high++; 2485 /* shift up all the later entries in this row */ 2486 PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1)); 2487 rp[i] = bcol; 2488 if (!A->structure_only) { 2489 PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1))); 2490 PetscCall(PetscArrayzero(ap + bs2 * i, bs2)); 2491 ap[bs2 * i + bs * cidx + ridx] = value; 2492 } 2493 a->nz++; 2494 A->nonzerostate++; 2495 noinsert1:; 2496 low = i; 2497 } 2498 ailen[brow] = nrow; 2499 } 2500 PetscFunctionReturn(0); 2501 } 2502 2503 PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info) { 2504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 2505 Mat outA; 2506 PetscBool row_identity, col_identity; 2507 2508 PetscFunctionBegin; 2509 PetscCheck(info->levels == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only levels = 0 supported for in-place ILU"); 2510 PetscCall(ISIdentity(row, &row_identity)); 2511 PetscCall(ISIdentity(col, &col_identity)); 2512 PetscCheck(row_identity && col_identity, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column permutations must be identity for in-place ILU"); 2513 2514 outA = inA; 2515 inA->factortype = MAT_FACTOR_LU; 2516 PetscCall(PetscFree(inA->solvertype)); 2517 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype)); 2518 2519 PetscCall(MatMarkDiagonal_SeqBAIJ(inA)); 2520 2521 PetscCall(PetscObjectReference((PetscObject)row)); 2522 PetscCall(ISDestroy(&a->row)); 2523 a->row = row; 2524 PetscCall(PetscObjectReference((PetscObject)col)); 2525 PetscCall(ISDestroy(&a->col)); 2526 a->col = col; 2527 2528 /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */ 2529 PetscCall(ISDestroy(&a->icol)); 2530 PetscCall(ISInvertPermutation(col, PETSC_DECIDE, &a->icol)); 2531 2532 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity))); 2533 if (!a->solve_work) { PetscCall(PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work)); } 2534 PetscCall(MatLUFactorNumeric(outA, inA, info)); 2535 PetscFunctionReturn(0); 2536 } 2537 2538 PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, PetscInt *indices) { 2539 Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data; 2540 PetscInt i, nz, mbs; 2541 2542 PetscFunctionBegin; 2543 nz = baij->maxnz; 2544 mbs = baij->mbs; 2545 for (i = 0; i < nz; i++) baij->j[i] = indices[i]; 2546 baij->nz = nz; 2547 for (i = 0; i < mbs; i++) baij->ilen[i] = baij->imax[i]; 2548 PetscFunctionReturn(0); 2549 } 2550 2551 /*@ 2552 MatSeqBAIJSetColumnIndices - Set the column indices for all the rows in the matrix. 2553 2554 Input Parameters: 2555 + mat - the `MATSEQBAIJ` matrix 2556 - indices - the column indices 2557 2558 Level: advanced 2559 2560 Notes: 2561 This can be called if you have precomputed the nonzero structure of the 2562 matrix and want to provide it to the matrix object to improve the performance 2563 of the `MatSetValues()` operation. 2564 2565 You MUST have set the correct numbers of nonzeros per row in the call to 2566 `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted. 2567 2568 MUST be called before any calls to `MatSetValues()` 2569 2570 .seealso: `MATSEQBAIJ`, `MatSetValues()` 2571 @*/ 2572 PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices) { 2573 PetscFunctionBegin; 2574 PetscValidHeaderSpecific(mat, MAT_CLASSID, 1); 2575 PetscValidIntPointer(indices, 2); 2576 PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, PetscInt *), (mat, indices)); 2577 PetscFunctionReturn(0); 2578 } 2579 2580 PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[]) { 2581 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2582 PetscInt i, j, n, row, bs, *ai, *aj, mbs; 2583 PetscReal atmp; 2584 PetscScalar *x, zero = 0.0; 2585 MatScalar *aa; 2586 PetscInt ncols, brow, krow, kcol; 2587 2588 PetscFunctionBegin; 2589 /* why is this not a macro???????????????????????????????????????????????????????????????? */ 2590 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2591 bs = A->rmap->bs; 2592 aa = a->a; 2593 ai = a->i; 2594 aj = a->j; 2595 mbs = a->mbs; 2596 2597 PetscCall(VecSet(v, zero)); 2598 PetscCall(VecGetArray(v, &x)); 2599 PetscCall(VecGetLocalSize(v, &n)); 2600 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 2601 for (i = 0; i < mbs; i++) { 2602 ncols = ai[1] - ai[0]; 2603 ai++; 2604 brow = bs * i; 2605 for (j = 0; j < ncols; j++) { 2606 for (kcol = 0; kcol < bs; kcol++) { 2607 for (krow = 0; krow < bs; krow++) { 2608 atmp = PetscAbsScalar(*aa); 2609 aa++; 2610 row = brow + krow; /* row index */ 2611 if (PetscAbsScalar(x[row]) < atmp) { 2612 x[row] = atmp; 2613 if (idx) idx[row] = bs * (*aj) + kcol; 2614 } 2615 } 2616 } 2617 aj++; 2618 } 2619 } 2620 PetscCall(VecRestoreArray(v, &x)); 2621 PetscFunctionReturn(0); 2622 } 2623 2624 PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str) { 2625 PetscFunctionBegin; 2626 /* If the two matrices have the same copy implementation, use fast copy. */ 2627 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 2628 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2629 Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)B->data; 2630 PetscInt ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs; 2631 2632 PetscCheck(a->i[ambs] == b->i[bmbs], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzero blocks in matrices A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", a->i[ambs], b->i[bmbs]); 2633 PetscCheck(abs == bbs, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Block size A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", abs, bbs); 2634 PetscCall(PetscArraycpy(b->a, a->a, bs2 * a->i[ambs])); 2635 PetscCall(PetscObjectStateIncrease((PetscObject)B)); 2636 } else { 2637 PetscCall(MatCopy_Basic(A, B, str)); 2638 } 2639 PetscFunctionReturn(0); 2640 } 2641 2642 PetscErrorCode MatSetUp_SeqBAIJ(Mat A) { 2643 PetscFunctionBegin; 2644 PetscCall(MatSeqBAIJSetPreallocation(A, A->rmap->bs, PETSC_DEFAULT, NULL)); 2645 PetscFunctionReturn(0); 2646 } 2647 2648 static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[]) { 2649 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2650 2651 PetscFunctionBegin; 2652 *array = a->a; 2653 PetscFunctionReturn(0); 2654 } 2655 2656 static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[]) { 2657 PetscFunctionBegin; 2658 *array = NULL; 2659 PetscFunctionReturn(0); 2660 } 2661 2662 PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz) { 2663 PetscInt bs = Y->rmap->bs, mbs = Y->rmap->N / bs; 2664 Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data; 2665 Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data; 2666 2667 PetscFunctionBegin; 2668 /* Set the number of nonzeros in the new matrix */ 2669 PetscCall(MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz)); 2670 PetscFunctionReturn(0); 2671 } 2672 2673 PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str) { 2674 Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data; 2675 PetscInt bs = Y->rmap->bs, bs2 = bs * bs; 2676 PetscBLASInt one = 1; 2677 2678 PetscFunctionBegin; 2679 if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) { 2680 PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE; 2681 if (e) { 2682 PetscCall(PetscArraycmp(x->i, y->i, x->mbs + 1, &e)); 2683 if (e) { 2684 PetscCall(PetscArraycmp(x->j, y->j, x->i[x->mbs], &e)); 2685 if (e) str = SAME_NONZERO_PATTERN; 2686 } 2687 } 2688 if (!e) PetscCheck(str != SAME_NONZERO_PATTERN, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "MatStructure is not SAME_NONZERO_PATTERN"); 2689 } 2690 if (str == SAME_NONZERO_PATTERN) { 2691 PetscScalar alpha = a; 2692 PetscBLASInt bnz; 2693 PetscCall(PetscBLASIntCast(x->nz * bs2, &bnz)); 2694 PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one)); 2695 PetscCall(PetscObjectStateIncrease((PetscObject)Y)); 2696 } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */ 2697 PetscCall(MatAXPY_Basic(Y, a, X, str)); 2698 } else { 2699 Mat B; 2700 PetscInt *nnz; 2701 PetscCheck(bs == X->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrices must have same block size"); 2702 PetscCall(PetscMalloc1(Y->rmap->N, &nnz)); 2703 PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B)); 2704 PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name)); 2705 PetscCall(MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N)); 2706 PetscCall(MatSetBlockSizesFromMats(B, Y, Y)); 2707 PetscCall(MatSetType(B, (MatType)((PetscObject)Y)->type_name)); 2708 PetscCall(MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz)); 2709 PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz)); 2710 PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str)); 2711 PetscCall(MatHeaderMerge(Y, &B)); 2712 PetscCall(PetscFree(nnz)); 2713 } 2714 PetscFunctionReturn(0); 2715 } 2716 2717 PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A) { 2718 #if defined(PETSC_USE_COMPLEX) 2719 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2720 PetscInt i, nz = a->bs2 * a->i[a->mbs]; 2721 MatScalar *aa = a->a; 2722 2723 PetscFunctionBegin; 2724 for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]); 2725 #else 2726 PetscFunctionBegin; 2727 #endif 2728 PetscFunctionReturn(0); 2729 } 2730 2731 PetscErrorCode MatRealPart_SeqBAIJ(Mat A) { 2732 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2733 PetscInt i, nz = a->bs2 * a->i[a->mbs]; 2734 MatScalar *aa = a->a; 2735 2736 PetscFunctionBegin; 2737 for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]); 2738 PetscFunctionReturn(0); 2739 } 2740 2741 PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A) { 2742 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2743 PetscInt i, nz = a->bs2 * a->i[a->mbs]; 2744 MatScalar *aa = a->a; 2745 2746 PetscFunctionBegin; 2747 for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]); 2748 PetscFunctionReturn(0); 2749 } 2750 2751 /* 2752 Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code 2753 */ 2754 PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 2755 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2756 PetscInt bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs; 2757 PetscInt nz = a->i[m], row, *jj, mr, col; 2758 2759 PetscFunctionBegin; 2760 *nn = n; 2761 if (!ia) PetscFunctionReturn(0); 2762 PetscCheck(!symmetric, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for BAIJ matrices"); 2763 PetscCall(PetscCalloc1(n, &collengths)); 2764 PetscCall(PetscMalloc1(n + 1, &cia)); 2765 PetscCall(PetscMalloc1(nz, &cja)); 2766 jj = a->j; 2767 for (i = 0; i < nz; i++) collengths[jj[i]]++; 2768 cia[0] = oshift; 2769 for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i]; 2770 PetscCall(PetscArrayzero(collengths, n)); 2771 jj = a->j; 2772 for (row = 0; row < m; row++) { 2773 mr = a->i[row + 1] - a->i[row]; 2774 for (i = 0; i < mr; i++) { 2775 col = *jj++; 2776 2777 cja[cia[col] + collengths[col]++ - oshift] = row + oshift; 2778 } 2779 } 2780 PetscCall(PetscFree(collengths)); 2781 *ia = cia; 2782 *ja = cja; 2783 PetscFunctionReturn(0); 2784 } 2785 2786 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 2787 PetscFunctionBegin; 2788 if (!ia) PetscFunctionReturn(0); 2789 PetscCall(PetscFree(*ia)); 2790 PetscCall(PetscFree(*ja)); 2791 PetscFunctionReturn(0); 2792 } 2793 2794 /* 2795 MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from 2796 MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output 2797 spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate() 2798 */ 2799 PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done) { 2800 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2801 PetscInt i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs; 2802 PetscInt nz = a->i[m], row, *jj, mr, col; 2803 PetscInt *cspidx; 2804 2805 PetscFunctionBegin; 2806 *nn = n; 2807 if (!ia) PetscFunctionReturn(0); 2808 2809 PetscCall(PetscCalloc1(n, &collengths)); 2810 PetscCall(PetscMalloc1(n + 1, &cia)); 2811 PetscCall(PetscMalloc1(nz, &cja)); 2812 PetscCall(PetscMalloc1(nz, &cspidx)); 2813 jj = a->j; 2814 for (i = 0; i < nz; i++) collengths[jj[i]]++; 2815 cia[0] = oshift; 2816 for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i]; 2817 PetscCall(PetscArrayzero(collengths, n)); 2818 jj = a->j; 2819 for (row = 0; row < m; row++) { 2820 mr = a->i[row + 1] - a->i[row]; 2821 for (i = 0; i < mr; i++) { 2822 col = *jj++; 2823 cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */ 2824 cja[cia[col] + collengths[col]++ - oshift] = row + oshift; 2825 } 2826 } 2827 PetscCall(PetscFree(collengths)); 2828 *ia = cia; 2829 *ja = cja; 2830 *spidx = cspidx; 2831 PetscFunctionReturn(0); 2832 } 2833 2834 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done) { 2835 PetscFunctionBegin; 2836 PetscCall(MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done)); 2837 PetscCall(PetscFree(*spidx)); 2838 PetscFunctionReturn(0); 2839 } 2840 2841 PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a) { 2842 Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data; 2843 2844 PetscFunctionBegin; 2845 if (!Y->preallocated || !aij->nz) PetscCall(MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL)); 2846 PetscCall(MatShift_Basic(Y, a)); 2847 PetscFunctionReturn(0); 2848 } 2849 2850 /* -------------------------------------------------------------------*/ 2851 static struct _MatOps MatOps_Values = { 2852 MatSetValues_SeqBAIJ, 2853 MatGetRow_SeqBAIJ, 2854 MatRestoreRow_SeqBAIJ, 2855 MatMult_SeqBAIJ_N, 2856 /* 4*/ MatMultAdd_SeqBAIJ_N, 2857 MatMultTranspose_SeqBAIJ, 2858 MatMultTransposeAdd_SeqBAIJ, 2859 NULL, 2860 NULL, 2861 NULL, 2862 /* 10*/ NULL, 2863 MatLUFactor_SeqBAIJ, 2864 NULL, 2865 NULL, 2866 MatTranspose_SeqBAIJ, 2867 /* 15*/ MatGetInfo_SeqBAIJ, 2868 MatEqual_SeqBAIJ, 2869 MatGetDiagonal_SeqBAIJ, 2870 MatDiagonalScale_SeqBAIJ, 2871 MatNorm_SeqBAIJ, 2872 /* 20*/ NULL, 2873 MatAssemblyEnd_SeqBAIJ, 2874 MatSetOption_SeqBAIJ, 2875 MatZeroEntries_SeqBAIJ, 2876 /* 24*/ MatZeroRows_SeqBAIJ, 2877 NULL, 2878 NULL, 2879 NULL, 2880 NULL, 2881 /* 29*/ MatSetUp_SeqBAIJ, 2882 NULL, 2883 NULL, 2884 NULL, 2885 NULL, 2886 /* 34*/ MatDuplicate_SeqBAIJ, 2887 NULL, 2888 NULL, 2889 MatILUFactor_SeqBAIJ, 2890 NULL, 2891 /* 39*/ MatAXPY_SeqBAIJ, 2892 MatCreateSubMatrices_SeqBAIJ, 2893 MatIncreaseOverlap_SeqBAIJ, 2894 MatGetValues_SeqBAIJ, 2895 MatCopy_SeqBAIJ, 2896 /* 44*/ NULL, 2897 MatScale_SeqBAIJ, 2898 MatShift_SeqBAIJ, 2899 NULL, 2900 MatZeroRowsColumns_SeqBAIJ, 2901 /* 49*/ NULL, 2902 MatGetRowIJ_SeqBAIJ, 2903 MatRestoreRowIJ_SeqBAIJ, 2904 MatGetColumnIJ_SeqBAIJ, 2905 MatRestoreColumnIJ_SeqBAIJ, 2906 /* 54*/ MatFDColoringCreate_SeqXAIJ, 2907 NULL, 2908 NULL, 2909 NULL, 2910 MatSetValuesBlocked_SeqBAIJ, 2911 /* 59*/ MatCreateSubMatrix_SeqBAIJ, 2912 MatDestroy_SeqBAIJ, 2913 MatView_SeqBAIJ, 2914 NULL, 2915 NULL, 2916 /* 64*/ NULL, 2917 NULL, 2918 NULL, 2919 NULL, 2920 NULL, 2921 /* 69*/ MatGetRowMaxAbs_SeqBAIJ, 2922 NULL, 2923 MatConvert_Basic, 2924 NULL, 2925 NULL, 2926 /* 74*/ NULL, 2927 MatFDColoringApply_BAIJ, 2928 NULL, 2929 NULL, 2930 NULL, 2931 /* 79*/ NULL, 2932 NULL, 2933 NULL, 2934 NULL, 2935 MatLoad_SeqBAIJ, 2936 /* 84*/ NULL, 2937 NULL, 2938 NULL, 2939 NULL, 2940 NULL, 2941 /* 89*/ NULL, 2942 NULL, 2943 NULL, 2944 NULL, 2945 NULL, 2946 /* 94*/ NULL, 2947 NULL, 2948 NULL, 2949 NULL, 2950 NULL, 2951 /* 99*/ NULL, 2952 NULL, 2953 NULL, 2954 MatConjugate_SeqBAIJ, 2955 NULL, 2956 /*104*/ NULL, 2957 MatRealPart_SeqBAIJ, 2958 MatImaginaryPart_SeqBAIJ, 2959 NULL, 2960 NULL, 2961 /*109*/ NULL, 2962 NULL, 2963 NULL, 2964 NULL, 2965 MatMissingDiagonal_SeqBAIJ, 2966 /*114*/ NULL, 2967 NULL, 2968 NULL, 2969 NULL, 2970 NULL, 2971 /*119*/ NULL, 2972 NULL, 2973 MatMultHermitianTranspose_SeqBAIJ, 2974 MatMultHermitianTransposeAdd_SeqBAIJ, 2975 NULL, 2976 /*124*/ NULL, 2977 MatGetColumnReductions_SeqBAIJ, 2978 MatInvertBlockDiagonal_SeqBAIJ, 2979 NULL, 2980 NULL, 2981 /*129*/ NULL, 2982 NULL, 2983 NULL, 2984 NULL, 2985 NULL, 2986 /*134*/ NULL, 2987 NULL, 2988 NULL, 2989 NULL, 2990 NULL, 2991 /*139*/ MatSetBlockSizes_Default, 2992 NULL, 2993 NULL, 2994 MatFDColoringSetUp_SeqXAIJ, 2995 NULL, 2996 /*144*/ MatCreateMPIMatConcatenateSeqMat_SeqBAIJ, 2997 MatDestroySubMatrices_SeqBAIJ, 2998 NULL, 2999 NULL, 3000 NULL, 3001 NULL, 3002 /*150*/ NULL, 3003 }; 3004 3005 PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat) { 3006 Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data; 3007 PetscInt nz = aij->i[aij->mbs] * aij->bs2; 3008 3009 PetscFunctionBegin; 3010 PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 3011 3012 /* allocate space for values if not already there */ 3013 if (!aij->saved_values) { PetscCall(PetscMalloc1(nz + 1, &aij->saved_values)); } 3014 3015 /* copy values over */ 3016 PetscCall(PetscArraycpy(aij->saved_values, aij->a, nz)); 3017 PetscFunctionReturn(0); 3018 } 3019 3020 PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat) { 3021 Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data; 3022 PetscInt nz = aij->i[aij->mbs] * aij->bs2; 3023 3024 PetscFunctionBegin; 3025 PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 3026 PetscCheck(aij->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 3027 3028 /* copy values over */ 3029 PetscCall(PetscArraycpy(aij->a, aij->saved_values, nz)); 3030 PetscFunctionReturn(0); 3031 } 3032 3033 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *); 3034 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *); 3035 3036 PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, PetscInt *nnz) { 3037 Mat_SeqBAIJ *b; 3038 PetscInt i, mbs, nbs, bs2; 3039 PetscBool flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 3040 3041 PetscFunctionBegin; 3042 if (nz >= 0 || nnz) realalloc = PETSC_TRUE; 3043 if (nz == MAT_SKIP_ALLOCATION) { 3044 skipallocation = PETSC_TRUE; 3045 nz = 0; 3046 } 3047 3048 PetscCall(MatSetBlockSize(B, PetscAbs(bs))); 3049 PetscCall(PetscLayoutSetUp(B->rmap)); 3050 PetscCall(PetscLayoutSetUp(B->cmap)); 3051 PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs)); 3052 3053 B->preallocated = PETSC_TRUE; 3054 3055 mbs = B->rmap->n / bs; 3056 nbs = B->cmap->n / bs; 3057 bs2 = bs * bs; 3058 3059 PetscCheck(mbs * bs == B->rmap->n && nbs * bs == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows %" PetscInt_FMT ", cols %" PetscInt_FMT " must be divisible by blocksize %" PetscInt_FMT, B->rmap->N, B->cmap->n, bs); 3060 3061 if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5; 3062 PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nz cannot be less than 0: value %" PetscInt_FMT, nz); 3063 if (nnz) { 3064 for (i = 0; i < mbs; i++) { 3065 PetscCheck(nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, nnz[i]); 3066 PetscCheck(nnz[i] <= nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be greater than block row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, nnz[i], nbs); 3067 } 3068 } 3069 3070 b = (Mat_SeqBAIJ *)B->data; 3071 PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat"); 3072 PetscCall(PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL)); 3073 PetscOptionsEnd(); 3074 3075 if (!flg) { 3076 switch (bs) { 3077 case 1: 3078 B->ops->mult = MatMult_SeqBAIJ_1; 3079 B->ops->multadd = MatMultAdd_SeqBAIJ_1; 3080 break; 3081 case 2: 3082 B->ops->mult = MatMult_SeqBAIJ_2; 3083 B->ops->multadd = MatMultAdd_SeqBAIJ_2; 3084 break; 3085 case 3: 3086 B->ops->mult = MatMult_SeqBAIJ_3; 3087 B->ops->multadd = MatMultAdd_SeqBAIJ_3; 3088 break; 3089 case 4: 3090 B->ops->mult = MatMult_SeqBAIJ_4; 3091 B->ops->multadd = MatMultAdd_SeqBAIJ_4; 3092 break; 3093 case 5: 3094 B->ops->mult = MatMult_SeqBAIJ_5; 3095 B->ops->multadd = MatMultAdd_SeqBAIJ_5; 3096 break; 3097 case 6: 3098 B->ops->mult = MatMult_SeqBAIJ_6; 3099 B->ops->multadd = MatMultAdd_SeqBAIJ_6; 3100 break; 3101 case 7: 3102 B->ops->mult = MatMult_SeqBAIJ_7; 3103 B->ops->multadd = MatMultAdd_SeqBAIJ_7; 3104 break; 3105 case 9: { 3106 PetscInt version = 1; 3107 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL)); 3108 switch (version) { 3109 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 3110 case 1: 3111 B->ops->mult = MatMult_SeqBAIJ_9_AVX2; 3112 B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2; 3113 PetscCall(PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs)); 3114 break; 3115 #endif 3116 default: 3117 B->ops->mult = MatMult_SeqBAIJ_N; 3118 B->ops->multadd = MatMultAdd_SeqBAIJ_N; 3119 PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs)); 3120 break; 3121 } 3122 break; 3123 } 3124 case 11: 3125 B->ops->mult = MatMult_SeqBAIJ_11; 3126 B->ops->multadd = MatMultAdd_SeqBAIJ_11; 3127 break; 3128 case 12: { 3129 PetscInt version = 1; 3130 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL)); 3131 switch (version) { 3132 case 1: 3133 B->ops->mult = MatMult_SeqBAIJ_12_ver1; 3134 B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1; 3135 PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs)); 3136 break; 3137 case 2: 3138 B->ops->mult = MatMult_SeqBAIJ_12_ver2; 3139 B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2; 3140 PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs)); 3141 break; 3142 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 3143 case 3: 3144 B->ops->mult = MatMult_SeqBAIJ_12_AVX2; 3145 B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1; 3146 PetscCall(PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs)); 3147 break; 3148 #endif 3149 default: 3150 B->ops->mult = MatMult_SeqBAIJ_N; 3151 B->ops->multadd = MatMultAdd_SeqBAIJ_N; 3152 PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs)); 3153 break; 3154 } 3155 break; 3156 } 3157 case 15: { 3158 PetscInt version = 1; 3159 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL)); 3160 switch (version) { 3161 case 1: 3162 B->ops->mult = MatMult_SeqBAIJ_15_ver1; 3163 PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs)); 3164 break; 3165 case 2: 3166 B->ops->mult = MatMult_SeqBAIJ_15_ver2; 3167 PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs)); 3168 break; 3169 case 3: 3170 B->ops->mult = MatMult_SeqBAIJ_15_ver3; 3171 PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs)); 3172 break; 3173 case 4: 3174 B->ops->mult = MatMult_SeqBAIJ_15_ver4; 3175 PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs)); 3176 break; 3177 default: 3178 B->ops->mult = MatMult_SeqBAIJ_N; 3179 PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs)); 3180 break; 3181 } 3182 B->ops->multadd = MatMultAdd_SeqBAIJ_N; 3183 break; 3184 } 3185 default: 3186 B->ops->mult = MatMult_SeqBAIJ_N; 3187 B->ops->multadd = MatMultAdd_SeqBAIJ_N; 3188 PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs)); 3189 break; 3190 } 3191 } 3192 B->ops->sor = MatSOR_SeqBAIJ; 3193 b->mbs = mbs; 3194 b->nbs = nbs; 3195 if (!skipallocation) { 3196 if (!b->imax) { 3197 PetscCall(PetscMalloc2(mbs, &b->imax, mbs, &b->ilen)); 3198 3199 b->free_imax_ilen = PETSC_TRUE; 3200 } 3201 /* b->ilen will count nonzeros in each block row so far. */ 3202 for (i = 0; i < mbs; i++) b->ilen[i] = 0; 3203 if (!nnz) { 3204 if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5; 3205 else if (nz < 0) nz = 1; 3206 nz = PetscMin(nz, nbs); 3207 for (i = 0; i < mbs; i++) b->imax[i] = nz; 3208 PetscCall(PetscIntMultError(nz, mbs, &nz)); 3209 } else { 3210 PetscInt64 nz64 = 0; 3211 for (i = 0; i < mbs; i++) { 3212 b->imax[i] = nnz[i]; 3213 nz64 += nnz[i]; 3214 } 3215 PetscCall(PetscIntCast(nz64, &nz)); 3216 } 3217 3218 /* allocate the matrix space */ 3219 PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i)); 3220 if (B->structure_only) { 3221 PetscCall(PetscMalloc1(nz, &b->j)); 3222 PetscCall(PetscMalloc1(B->rmap->N + 1, &b->i)); 3223 } else { 3224 PetscInt nzbs2 = 0; 3225 PetscCall(PetscIntMultError(nz, bs2, &nzbs2)); 3226 PetscCall(PetscMalloc3(nzbs2, &b->a, nz, &b->j, B->rmap->N + 1, &b->i)); 3227 PetscCall(PetscArrayzero(b->a, nz * bs2)); 3228 } 3229 PetscCall(PetscArrayzero(b->j, nz)); 3230 3231 if (B->structure_only) { 3232 b->singlemalloc = PETSC_FALSE; 3233 b->free_a = PETSC_FALSE; 3234 } else { 3235 b->singlemalloc = PETSC_TRUE; 3236 b->free_a = PETSC_TRUE; 3237 } 3238 b->free_ij = PETSC_TRUE; 3239 3240 b->i[0] = 0; 3241 for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1]; 3242 3243 } else { 3244 b->free_a = PETSC_FALSE; 3245 b->free_ij = PETSC_FALSE; 3246 } 3247 3248 b->bs2 = bs2; 3249 b->mbs = mbs; 3250 b->nz = 0; 3251 b->maxnz = nz; 3252 B->info.nz_unneeded = (PetscReal)b->maxnz * bs2; 3253 B->was_assembled = PETSC_FALSE; 3254 B->assembled = PETSC_FALSE; 3255 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 3256 PetscFunctionReturn(0); 3257 } 3258 3259 PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[]) { 3260 PetscInt i, m, nz, nz_max = 0, *nnz; 3261 PetscScalar *values = NULL; 3262 PetscBool roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented; 3263 3264 PetscFunctionBegin; 3265 PetscCheck(bs >= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Invalid block size specified, must be positive but it is %" PetscInt_FMT, bs); 3266 PetscCall(PetscLayoutSetBlockSize(B->rmap, bs)); 3267 PetscCall(PetscLayoutSetBlockSize(B->cmap, bs)); 3268 PetscCall(PetscLayoutSetUp(B->rmap)); 3269 PetscCall(PetscLayoutSetUp(B->cmap)); 3270 PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs)); 3271 m = B->rmap->n / bs; 3272 3273 PetscCheck(ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "ii[0] must be 0 but it is %" PetscInt_FMT, ii[0]); 3274 PetscCall(PetscMalloc1(m + 1, &nnz)); 3275 for (i = 0; i < m; i++) { 3276 nz = ii[i + 1] - ii[i]; 3277 PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative number of columns %" PetscInt_FMT, i, nz); 3278 nz_max = PetscMax(nz_max, nz); 3279 nnz[i] = nz; 3280 } 3281 PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz)); 3282 PetscCall(PetscFree(nnz)); 3283 3284 values = (PetscScalar *)V; 3285 if (!values) PetscCall(PetscCalloc1(bs * bs * (nz_max + 1), &values)); 3286 for (i = 0; i < m; i++) { 3287 PetscInt ncols = ii[i + 1] - ii[i]; 3288 const PetscInt *icols = jj + ii[i]; 3289 if (bs == 1 || !roworiented) { 3290 const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0); 3291 PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES)); 3292 } else { 3293 PetscInt j; 3294 for (j = 0; j < ncols; j++) { 3295 const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0); 3296 PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES)); 3297 } 3298 } 3299 } 3300 if (!V) PetscCall(PetscFree(values)); 3301 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 3302 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 3303 PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 3304 PetscFunctionReturn(0); 3305 } 3306 3307 /*@C 3308 MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored 3309 3310 Not Collective 3311 3312 Input Parameter: 3313 . mat - a `MATSEQBAIJ` matrix 3314 3315 Output Parameter: 3316 . array - pointer to the data 3317 3318 Level: intermediate 3319 3320 .seealso: `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()` 3321 @*/ 3322 PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar **array) { 3323 PetscFunctionBegin; 3324 PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array)); 3325 PetscFunctionReturn(0); 3326 } 3327 3328 /*@C 3329 MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()` 3330 3331 Not Collective 3332 3333 Input Parameters: 3334 + mat - a `MATSEQBAIJ` matrix 3335 - array - pointer to the data 3336 3337 Level: intermediate 3338 3339 .seealso: `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()` 3340 @*/ 3341 PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar **array) { 3342 PetscFunctionBegin; 3343 PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array)); 3344 PetscFunctionReturn(0); 3345 } 3346 3347 /*MC 3348 MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on 3349 block sparse compressed row format. 3350 3351 Options Database Keys: 3352 + -mat_type seqbaij - sets the matrix type to "seqbaij" during a call to MatSetFromOptions() 3353 - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS) 3354 3355 Level: beginner 3356 3357 Notes: 3358 `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no 3359 space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored 3360 3361 Run with -info to see what version of the matrix-vector product is being used 3362 3363 .seealso: `MatCreateSeqBAIJ()` 3364 M*/ 3365 3366 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *); 3367 3368 PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B) { 3369 PetscMPIInt size; 3370 Mat_SeqBAIJ *b; 3371 3372 PetscFunctionBegin; 3373 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 3374 PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Comm must be of size 1"); 3375 3376 PetscCall(PetscNew(&b)); 3377 B->data = (void *)b; 3378 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 3379 3380 b->row = NULL; 3381 b->col = NULL; 3382 b->icol = NULL; 3383 b->reallocs = 0; 3384 b->saved_values = NULL; 3385 3386 b->roworiented = PETSC_TRUE; 3387 b->nonew = 0; 3388 b->diag = NULL; 3389 B->spptr = NULL; 3390 B->info.nz_unneeded = (PetscReal)b->maxnz * b->bs2; 3391 b->keepnonzeropattern = PETSC_FALSE; 3392 3393 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ)); 3394 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ)); 3395 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ)); 3396 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ)); 3397 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ)); 3398 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ)); 3399 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ)); 3400 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ)); 3401 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ)); 3402 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ)); 3403 #if defined(PETSC_HAVE_HYPRE) 3404 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE)); 3405 #endif 3406 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS)); 3407 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ)); 3408 PetscFunctionReturn(0); 3409 } 3410 3411 PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) { 3412 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data; 3413 PetscInt i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2; 3414 3415 PetscFunctionBegin; 3416 PetscCheck(a->i[mbs] == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupt matrix"); 3417 3418 if (cpvalues == MAT_SHARE_NONZERO_PATTERN) { 3419 c->imax = a->imax; 3420 c->ilen = a->ilen; 3421 c->free_imax_ilen = PETSC_FALSE; 3422 } else { 3423 PetscCall(PetscMalloc2(mbs, &c->imax, mbs, &c->ilen)); 3424 for (i = 0; i < mbs; i++) { 3425 c->imax[i] = a->imax[i]; 3426 c->ilen[i] = a->ilen[i]; 3427 } 3428 c->free_imax_ilen = PETSC_TRUE; 3429 } 3430 3431 /* allocate the matrix space */ 3432 if (mallocmatspace) { 3433 if (cpvalues == MAT_SHARE_NONZERO_PATTERN) { 3434 PetscCall(PetscCalloc1(bs2 * nz, &c->a)); 3435 3436 c->i = a->i; 3437 c->j = a->j; 3438 c->singlemalloc = PETSC_FALSE; 3439 c->free_a = PETSC_TRUE; 3440 c->free_ij = PETSC_FALSE; 3441 c->parent = A; 3442 C->preallocated = PETSC_TRUE; 3443 C->assembled = PETSC_TRUE; 3444 3445 PetscCall(PetscObjectReference((PetscObject)A)); 3446 PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 3447 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 3448 } else { 3449 PetscCall(PetscMalloc3(bs2 * nz, &c->a, nz, &c->j, mbs + 1, &c->i)); 3450 3451 c->singlemalloc = PETSC_TRUE; 3452 c->free_a = PETSC_TRUE; 3453 c->free_ij = PETSC_TRUE; 3454 3455 PetscCall(PetscArraycpy(c->i, a->i, mbs + 1)); 3456 if (mbs > 0) { 3457 PetscCall(PetscArraycpy(c->j, a->j, nz)); 3458 if (cpvalues == MAT_COPY_VALUES) { 3459 PetscCall(PetscArraycpy(c->a, a->a, bs2 * nz)); 3460 } else { 3461 PetscCall(PetscArrayzero(c->a, bs2 * nz)); 3462 } 3463 } 3464 C->preallocated = PETSC_TRUE; 3465 C->assembled = PETSC_TRUE; 3466 } 3467 } 3468 3469 c->roworiented = a->roworiented; 3470 c->nonew = a->nonew; 3471 3472 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 3473 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 3474 3475 c->bs2 = a->bs2; 3476 c->mbs = a->mbs; 3477 c->nbs = a->nbs; 3478 3479 if (a->diag) { 3480 if (cpvalues == MAT_SHARE_NONZERO_PATTERN) { 3481 c->diag = a->diag; 3482 c->free_diag = PETSC_FALSE; 3483 } else { 3484 PetscCall(PetscMalloc1(mbs + 1, &c->diag)); 3485 for (i = 0; i < mbs; i++) c->diag[i] = a->diag[i]; 3486 c->free_diag = PETSC_TRUE; 3487 } 3488 } else c->diag = NULL; 3489 3490 c->nz = a->nz; 3491 c->maxnz = a->nz; /* Since we allocate exactly the right amount */ 3492 c->solve_work = NULL; 3493 c->mult_work = NULL; 3494 c->sor_workt = NULL; 3495 c->sor_work = NULL; 3496 3497 c->compressedrow.use = a->compressedrow.use; 3498 c->compressedrow.nrows = a->compressedrow.nrows; 3499 if (a->compressedrow.use) { 3500 i = a->compressedrow.nrows; 3501 PetscCall(PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex)); 3502 PetscCall(PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1)); 3503 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i)); 3504 } else { 3505 c->compressedrow.use = PETSC_FALSE; 3506 c->compressedrow.i = NULL; 3507 c->compressedrow.rindex = NULL; 3508 } 3509 C->nonzerostate = A->nonzerostate; 3510 3511 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 3512 PetscFunctionReturn(0); 3513 } 3514 3515 PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B) { 3516 PetscFunctionBegin; 3517 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 3518 PetscCall(MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n)); 3519 PetscCall(MatSetType(*B, MATSEQBAIJ)); 3520 PetscCall(MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE)); 3521 PetscFunctionReturn(0); 3522 } 3523 3524 /* Used for both SeqBAIJ and SeqSBAIJ matrices */ 3525 PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer) { 3526 PetscInt header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k; 3527 PetscInt *rowidxs, *colidxs; 3528 PetscScalar *matvals; 3529 3530 PetscFunctionBegin; 3531 PetscCall(PetscViewerSetUp(viewer)); 3532 3533 /* read matrix header */ 3534 PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT)); 3535 PetscCheck(header[0] == MAT_FILE_CLASSID, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file"); 3536 M = header[1]; 3537 N = header[2]; 3538 nz = header[3]; 3539 PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M); 3540 PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N); 3541 PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as SeqBAIJ"); 3542 3543 /* set block sizes from the viewer's .info file */ 3544 PetscCall(MatLoad_Binary_BlockSizes(mat, viewer)); 3545 /* set local and global sizes if not set already */ 3546 if (mat->rmap->n < 0) mat->rmap->n = M; 3547 if (mat->cmap->n < 0) mat->cmap->n = N; 3548 if (mat->rmap->N < 0) mat->rmap->N = M; 3549 if (mat->cmap->N < 0) mat->cmap->N = N; 3550 PetscCall(PetscLayoutSetUp(mat->rmap)); 3551 PetscCall(PetscLayoutSetUp(mat->cmap)); 3552 3553 /* check if the matrix sizes are correct */ 3554 PetscCall(MatGetSize(mat, &rows, &cols)); 3555 PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols); 3556 PetscCall(MatGetBlockSize(mat, &bs)); 3557 PetscCall(MatGetLocalSize(mat, &m, &n)); 3558 mbs = m / bs; 3559 nbs = n / bs; 3560 3561 /* read in row lengths, column indices and nonzero values */ 3562 PetscCall(PetscMalloc1(m + 1, &rowidxs)); 3563 PetscCall(PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT)); 3564 rowidxs[0] = 0; 3565 for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i]; 3566 sum = rowidxs[m]; 3567 PetscCheck(sum == nz, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum); 3568 3569 /* read in column indices and nonzero values */ 3570 PetscCall(PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals)); 3571 PetscCall(PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT)); 3572 PetscCall(PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR)); 3573 3574 { /* preallocate matrix storage */ 3575 PetscBT bt; /* helper bit set to count nonzeros */ 3576 PetscInt *nnz; 3577 PetscBool sbaij; 3578 3579 PetscCall(PetscBTCreate(nbs, &bt)); 3580 PetscCall(PetscCalloc1(mbs, &nnz)); 3581 PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij)); 3582 for (i = 0; i < mbs; i++) { 3583 PetscCall(PetscBTMemzero(nbs, bt)); 3584 for (k = 0; k < bs; k++) { 3585 PetscInt row = bs * i + k; 3586 for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) { 3587 PetscInt col = colidxs[j]; 3588 if (!sbaij || col >= row) 3589 if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++; 3590 } 3591 } 3592 } 3593 PetscCall(PetscBTDestroy(&bt)); 3594 PetscCall(MatSeqBAIJSetPreallocation(mat, bs, 0, nnz)); 3595 PetscCall(MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz)); 3596 PetscCall(PetscFree(nnz)); 3597 } 3598 3599 /* store matrix values */ 3600 for (i = 0; i < m; i++) { 3601 PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1]; 3602 PetscCall((*mat->ops->setvalues)(mat, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES)); 3603 } 3604 3605 PetscCall(PetscFree(rowidxs)); 3606 PetscCall(PetscFree2(colidxs, matvals)); 3607 PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY)); 3608 PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY)); 3609 PetscFunctionReturn(0); 3610 } 3611 3612 PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer) { 3613 PetscBool isbinary; 3614 3615 PetscFunctionBegin; 3616 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 3617 PetscCheck(isbinary, PetscObjectComm((PetscObject)viewer), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)mat)->type_name); 3618 PetscCall(MatLoad_SeqBAIJ_Binary(mat, viewer)); 3619 PetscFunctionReturn(0); 3620 } 3621 3622 /*@C 3623 MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block 3624 compressed row) format. For good matrix assembly performance the 3625 user should preallocate the matrix storage by setting the parameter nz 3626 (or the array nnz). By setting these parameters accurately, performance 3627 during matrix assembly can be increased by more than a factor of 50. 3628 3629 Collective 3630 3631 Input Parameters: 3632 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3633 . bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row 3634 blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()` 3635 . m - number of rows 3636 . n - number of columns 3637 . nz - number of nonzero blocks per block row (same for all rows) 3638 - nnz - array containing the number of nonzero blocks in the various block rows 3639 (possibly different for each block row) or NULL 3640 3641 Output Parameter: 3642 . A - the matrix 3643 3644 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3645 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3646 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3647 3648 Options Database Keys: 3649 + -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower) 3650 - -mat_block_size - size of the blocks to use 3651 3652 Level: intermediate 3653 3654 Notes: 3655 The number of rows and columns must be divisible by blocksize. 3656 3657 If the nnz parameter is given then the nz parameter is ignored 3658 3659 A nonzero block is any block that as 1 or more nonzeros in it 3660 3661 The `MATSEQBAIJ` format is fully compatible with standard Fortran 77 3662 storage. That is, the stored row and column indices can begin at 3663 either one (as in Fortran) or zero. See the users' manual for details. 3664 3665 Specify the preallocated storage with either nz or nnz (not both). 3666 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 3667 allocation. See [Sparse Matrices](sec_matsparse) for details. 3668 matrices. 3669 3670 .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()` 3671 @*/ 3672 PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) { 3673 PetscFunctionBegin; 3674 PetscCall(MatCreate(comm, A)); 3675 PetscCall(MatSetSizes(*A, m, n, m, n)); 3676 PetscCall(MatSetType(*A, MATSEQBAIJ)); 3677 PetscCall(MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz)); 3678 PetscFunctionReturn(0); 3679 } 3680 3681 /*@C 3682 MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros 3683 per row in the matrix. For good matrix assembly performance the 3684 user should preallocate the matrix storage by setting the parameter nz 3685 (or the array nnz). By setting these parameters accurately, performance 3686 during matrix assembly can be increased by more than a factor of 50. 3687 3688 Collective 3689 3690 Input Parameters: 3691 + B - the matrix 3692 . bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row 3693 blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()` 3694 . nz - number of block nonzeros per block row (same for all rows) 3695 - nnz - array containing the number of block nonzeros in the various block rows 3696 (possibly different for each block row) or NULL 3697 3698 Options Database Keys: 3699 + -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower) 3700 - -mat_block_size - size of the blocks to use 3701 3702 Level: intermediate 3703 3704 Notes: 3705 If the nnz parameter is given then the nz parameter is ignored 3706 3707 You can call `MatGetInfo()` to get information on how effective the preallocation was; 3708 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 3709 You can also run with the option -info and look for messages with the string 3710 malloc in them to see if additional memory allocation was needed. 3711 3712 The `MATSEQBAIJ` format is fully compatible with standard Fortran 77 3713 storage. That is, the stored row and column indices can begin at 3714 either one (as in Fortran) or zero. See the users' manual for details. 3715 3716 Specify the preallocated storage with either nz or nnz (not both). 3717 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 3718 allocation. See [Sparse Matrices](sec_matsparse) for details. 3719 3720 .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()` 3721 @*/ 3722 PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[]) { 3723 PetscFunctionBegin; 3724 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 3725 PetscValidType(B, 1); 3726 PetscValidLogicalCollectiveInt(B, bs, 2); 3727 PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz)); 3728 PetscFunctionReturn(0); 3729 } 3730 3731 /*@C 3732 MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values 3733 3734 Collective 3735 3736 Input Parameters: 3737 + B - the matrix 3738 . i - the indices into j for the start of each local row (starts with zero) 3739 . j - the column indices for each local row (starts with zero) these must be sorted for each row 3740 - v - optional values in the matrix 3741 3742 Level: advanced 3743 3744 Notes: 3745 The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`. For example, C programs 3746 may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is 3747 over rows within a block and the last index is over columns within a block row. Fortran programs will likely set 3748 `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a 3749 block column and the second index is over columns within a block. 3750 3751 Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well 3752 3753 .seealso: `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ` 3754 @*/ 3755 PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[]) { 3756 PetscFunctionBegin; 3757 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 3758 PetscValidType(B, 1); 3759 PetscValidLogicalCollectiveInt(B, bs, 2); 3760 PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v)); 3761 PetscFunctionReturn(0); 3762 } 3763 3764 /*@ 3765 MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user. 3766 3767 Collective 3768 3769 Input Parameters: 3770 + comm - must be an MPI communicator of size 1 3771 . bs - size of block 3772 . m - number of rows 3773 . n - number of columns 3774 . i - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix 3775 . j - column indices 3776 - a - matrix values 3777 3778 Output Parameter: 3779 . mat - the matrix 3780 3781 Level: advanced 3782 3783 Notes: 3784 The i, j, and a arrays are not copied by this routine, the user must free these arrays 3785 once the matrix is destroyed 3786 3787 You cannot set new nonzero locations into this matrix, that will generate an error. 3788 3789 The i and j indices are 0 based 3790 3791 When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format 3792 3793 The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is 3794 the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first 3795 block, followed by the second column of the first block etc etc. That is, the blocks are contiguous in memory 3796 with column-major ordering within blocks. 3797 3798 .seealso: `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()` 3799 @*/ 3800 PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat) { 3801 PetscInt ii; 3802 Mat_SeqBAIJ *baij; 3803 3804 PetscFunctionBegin; 3805 PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "block size %" PetscInt_FMT " > 1 is not supported yet", bs); 3806 if (m > 0) PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 3807 3808 PetscCall(MatCreate(comm, mat)); 3809 PetscCall(MatSetSizes(*mat, m, n, m, n)); 3810 PetscCall(MatSetType(*mat, MATSEQBAIJ)); 3811 PetscCall(MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL)); 3812 baij = (Mat_SeqBAIJ *)(*mat)->data; 3813 PetscCall(PetscMalloc2(m, &baij->imax, m, &baij->ilen)); 3814 3815 baij->i = i; 3816 baij->j = j; 3817 baij->a = a; 3818 3819 baij->singlemalloc = PETSC_FALSE; 3820 baij->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 3821 baij->free_a = PETSC_FALSE; 3822 baij->free_ij = PETSC_FALSE; 3823 3824 for (ii = 0; ii < m; ii++) { 3825 baij->ilen[ii] = baij->imax[ii] = i[ii + 1] - i[ii]; 3826 PetscCheck(i[ii + 1] - i[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative row length in i (row indices) row = %" PetscInt_FMT " length = %" PetscInt_FMT, ii, i[ii + 1] - i[ii]); 3827 } 3828 if (PetscDefined(USE_DEBUG)) { 3829 for (ii = 0; ii < baij->i[m]; ii++) { 3830 PetscCheck(j[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative column index at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]); 3831 PetscCheck(j[ii] <= n - 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index to large at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]); 3832 } 3833 } 3834 3835 PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY)); 3836 PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY)); 3837 PetscFunctionReturn(0); 3838 } 3839 3840 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat) { 3841 PetscFunctionBegin; 3842 PetscCall(MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat)); 3843 PetscFunctionReturn(0); 3844 } 3845