1 #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h> 2 #include <petsc/private/matimpl.h> 3 4 static PetscErrorCode PCApply_PBJacobi(PC pc, Vec x, Vec y) 5 { 6 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 7 PetscInt i, ib, jb; 8 const PetscInt m = jac->mbs; 9 const PetscInt bs = jac->bs; 10 const MatScalar *diag = jac->diag; 11 PetscScalar *yy, x0, x1, x2, x3, x4, x5, x6; 12 const PetscScalar *xx; 13 14 PetscFunctionBegin; 15 PetscCall(VecGetArrayRead(x, &xx)); 16 PetscCall(VecGetArray(y, &yy)); 17 switch (bs) { 18 case 1: 19 for (i = 0; i < m; i++) yy[i] = diag[i] * xx[i]; 20 break; 21 case 2: 22 for (i = 0; i < m; i++) { 23 x0 = xx[2 * i]; 24 x1 = xx[2 * i + 1]; 25 yy[2 * i] = diag[0] * x0 + diag[2] * x1; 26 yy[2 * i + 1] = diag[1] * x0 + diag[3] * x1; 27 diag += 4; 28 } 29 break; 30 case 3: 31 for (i = 0; i < m; i++) { 32 x0 = xx[3 * i]; 33 x1 = xx[3 * i + 1]; 34 x2 = xx[3 * i + 2]; 35 36 yy[3 * i] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 37 yy[3 * i + 1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 38 yy[3 * i + 2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 39 diag += 9; 40 } 41 break; 42 case 4: 43 for (i = 0; i < m; i++) { 44 x0 = xx[4 * i]; 45 x1 = xx[4 * i + 1]; 46 x2 = xx[4 * i + 2]; 47 x3 = xx[4 * i + 3]; 48 49 yy[4 * i] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 50 yy[4 * i + 1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 51 yy[4 * i + 2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 52 yy[4 * i + 3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 53 diag += 16; 54 } 55 break; 56 case 5: 57 for (i = 0; i < m; i++) { 58 x0 = xx[5 * i]; 59 x1 = xx[5 * i + 1]; 60 x2 = xx[5 * i + 2]; 61 x3 = xx[5 * i + 3]; 62 x4 = xx[5 * i + 4]; 63 64 yy[5 * i] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 65 yy[5 * i + 1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 66 yy[5 * i + 2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 67 yy[5 * i + 3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 68 yy[5 * i + 4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 69 diag += 25; 70 } 71 break; 72 case 6: 73 for (i = 0; i < m; i++) { 74 x0 = xx[6 * i]; 75 x1 = xx[6 * i + 1]; 76 x2 = xx[6 * i + 2]; 77 x3 = xx[6 * i + 3]; 78 x4 = xx[6 * i + 4]; 79 x5 = xx[6 * i + 5]; 80 81 yy[6 * i] = diag[0] * x0 + diag[6] * x1 + diag[12] * x2 + diag[18] * x3 + diag[24] * x4 + diag[30] * x5; 82 yy[6 * i + 1] = diag[1] * x0 + diag[7] * x1 + diag[13] * x2 + diag[19] * x3 + diag[25] * x4 + diag[31] * x5; 83 yy[6 * i + 2] = diag[2] * x0 + diag[8] * x1 + diag[14] * x2 + diag[20] * x3 + diag[26] * x4 + diag[32] * x5; 84 yy[6 * i + 3] = diag[3] * x0 + diag[9] * x1 + diag[15] * x2 + diag[21] * x3 + diag[27] * x4 + diag[33] * x5; 85 yy[6 * i + 4] = diag[4] * x0 + diag[10] * x1 + diag[16] * x2 + diag[22] * x3 + diag[28] * x4 + diag[34] * x5; 86 yy[6 * i + 5] = diag[5] * x0 + diag[11] * x1 + diag[17] * x2 + diag[23] * x3 + diag[29] * x4 + diag[35] * x5; 87 diag += 36; 88 } 89 break; 90 case 7: 91 for (i = 0; i < m; i++) { 92 x0 = xx[7 * i]; 93 x1 = xx[7 * i + 1]; 94 x2 = xx[7 * i + 2]; 95 x3 = xx[7 * i + 3]; 96 x4 = xx[7 * i + 4]; 97 x5 = xx[7 * i + 5]; 98 x6 = xx[7 * i + 6]; 99 100 yy[7 * i] = diag[0] * x0 + diag[7] * x1 + diag[14] * x2 + diag[21] * x3 + diag[28] * x4 + diag[35] * x5 + diag[42] * x6; 101 yy[7 * i + 1] = diag[1] * x0 + diag[8] * x1 + diag[15] * x2 + diag[22] * x3 + diag[29] * x4 + diag[36] * x5 + diag[43] * x6; 102 yy[7 * i + 2] = diag[2] * x0 + diag[9] * x1 + diag[16] * x2 + diag[23] * x3 + diag[30] * x4 + diag[37] * x5 + diag[44] * x6; 103 yy[7 * i + 3] = diag[3] * x0 + diag[10] * x1 + diag[17] * x2 + diag[24] * x3 + diag[31] * x4 + diag[38] * x5 + diag[45] * x6; 104 yy[7 * i + 4] = diag[4] * x0 + diag[11] * x1 + diag[18] * x2 + diag[25] * x3 + diag[32] * x4 + diag[39] * x5 + diag[46] * x6; 105 yy[7 * i + 5] = diag[5] * x0 + diag[12] * x1 + diag[19] * x2 + diag[26] * x3 + diag[33] * x4 + diag[40] * x5 + diag[47] * x6; 106 yy[7 * i + 6] = diag[6] * x0 + diag[13] * x1 + diag[20] * x2 + diag[27] * x3 + diag[34] * x4 + diag[41] * x5 + diag[48] * x6; 107 diag += 49; 108 } 109 break; 110 default: 111 for (i = 0; i < m; i++) { 112 for (ib = 0; ib < bs; ib++) { 113 PetscScalar rowsum = 0; 114 for (jb = 0; jb < bs; jb++) rowsum += diag[ib + jb * bs] * xx[bs * i + jb]; 115 yy[bs * i + ib] = rowsum; 116 } 117 diag += bs * bs; 118 } 119 } 120 PetscCall(VecRestoreArrayRead(x, &xx)); 121 PetscCall(VecRestoreArray(y, &yy)); 122 PetscCall(PetscLogFlops((2.0 * bs * bs - bs) * m)); /* 2*bs2 - bs */ 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 126 static PetscErrorCode PCApplyTranspose_PBJacobi(PC pc, Vec x, Vec y) 127 { 128 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 129 PetscInt i, ib, jb; 130 const PetscInt m = jac->mbs; 131 const PetscInt bs = jac->bs; 132 const MatScalar *diag = jac->diag; 133 PetscScalar *yy, x0, x1, x2, x3, x4, x5, x6; 134 const PetscScalar *xx; 135 136 PetscFunctionBegin; 137 PetscCall(VecGetArrayRead(x, &xx)); 138 PetscCall(VecGetArray(y, &yy)); 139 switch (bs) { 140 case 1: 141 for (i = 0; i < m; i++) yy[i] = diag[i] * xx[i]; 142 break; 143 case 2: 144 for (i = 0; i < m; i++) { 145 x0 = xx[2 * i]; 146 x1 = xx[2 * i + 1]; 147 yy[2 * i] = diag[0] * x0 + diag[1] * x1; 148 yy[2 * i + 1] = diag[2] * x0 + diag[3] * x1; 149 diag += 4; 150 } 151 break; 152 case 3: 153 for (i = 0; i < m; i++) { 154 x0 = xx[3 * i]; 155 x1 = xx[3 * i + 1]; 156 x2 = xx[3 * i + 2]; 157 158 yy[3 * i] = diag[0] * x0 + diag[1] * x1 + diag[2] * x2; 159 yy[3 * i + 1] = diag[3] * x0 + diag[4] * x1 + diag[5] * x2; 160 yy[3 * i + 2] = diag[6] * x0 + diag[7] * x1 + diag[8] * x2; 161 diag += 9; 162 } 163 break; 164 case 4: 165 for (i = 0; i < m; i++) { 166 x0 = xx[4 * i]; 167 x1 = xx[4 * i + 1]; 168 x2 = xx[4 * i + 2]; 169 x3 = xx[4 * i + 3]; 170 171 yy[4 * i] = diag[0] * x0 + diag[1] * x1 + diag[2] * x2 + diag[3] * x3; 172 yy[4 * i + 1] = diag[4] * x0 + diag[5] * x1 + diag[6] * x2 + diag[7] * x3; 173 yy[4 * i + 2] = diag[8] * x0 + diag[9] * x1 + diag[10] * x2 + diag[11] * x3; 174 yy[4 * i + 3] = diag[12] * x0 + diag[13] * x1 + diag[14] * x2 + diag[15] * x3; 175 diag += 16; 176 } 177 break; 178 case 5: 179 for (i = 0; i < m; i++) { 180 x0 = xx[5 * i]; 181 x1 = xx[5 * i + 1]; 182 x2 = xx[5 * i + 2]; 183 x3 = xx[5 * i + 3]; 184 x4 = xx[5 * i + 4]; 185 186 yy[5 * i] = diag[0] * x0 + diag[1] * x1 + diag[2] * x2 + diag[3] * x3 + diag[4] * x4; 187 yy[5 * i + 1] = diag[5] * x0 + diag[6] * x1 + diag[7] * x2 + diag[8] * x3 + diag[9] * x4; 188 yy[5 * i + 2] = diag[10] * x0 + diag[11] * x1 + diag[12] * x2 + diag[13] * x3 + diag[14] * x4; 189 yy[5 * i + 3] = diag[15] * x0 + diag[16] * x1 + diag[17] * x2 + diag[18] * x3 + diag[19] * x4; 190 yy[5 * i + 4] = diag[20] * x0 + diag[21] * x1 + diag[22] * x2 + diag[23] * x3 + diag[24] * x4; 191 diag += 25; 192 } 193 break; 194 case 6: 195 for (i = 0; i < m; i++) { 196 x0 = xx[6 * i]; 197 x1 = xx[6 * i + 1]; 198 x2 = xx[6 * i + 2]; 199 x3 = xx[6 * i + 3]; 200 x4 = xx[6 * i + 4]; 201 x5 = xx[6 * i + 5]; 202 203 yy[6 * i] = diag[0] * x0 + diag[1] * x1 + diag[2] * x2 + diag[3] * x3 + diag[4] * x4 + diag[5] * x5; 204 yy[6 * i + 1] = diag[6] * x0 + diag[7] * x1 + diag[8] * x2 + diag[9] * x3 + diag[10] * x4 + diag[11] * x5; 205 yy[6 * i + 2] = diag[12] * x0 + diag[13] * x1 + diag[14] * x2 + diag[15] * x3 + diag[16] * x4 + diag[17] * x5; 206 yy[6 * i + 3] = diag[18] * x0 + diag[19] * x1 + diag[20] * x2 + diag[21] * x3 + diag[22] * x4 + diag[23] * x5; 207 yy[6 * i + 4] = diag[24] * x0 + diag[25] * x1 + diag[26] * x2 + diag[27] * x3 + diag[28] * x4 + diag[29] * x5; 208 yy[6 * i + 5] = diag[30] * x0 + diag[31] * x1 + diag[32] * x2 + diag[33] * x3 + diag[34] * x4 + diag[35] * x5; 209 diag += 36; 210 } 211 break; 212 case 7: 213 for (i = 0; i < m; i++) { 214 x0 = xx[7 * i]; 215 x1 = xx[7 * i + 1]; 216 x2 = xx[7 * i + 2]; 217 x3 = xx[7 * i + 3]; 218 x4 = xx[7 * i + 4]; 219 x5 = xx[7 * i + 5]; 220 x6 = xx[7 * i + 6]; 221 222 yy[7 * i] = diag[0] * x0 + diag[1] * x1 + diag[2] * x2 + diag[3] * x3 + diag[4] * x4 + diag[5] * x5 + diag[6] * x6; 223 yy[7 * i + 1] = diag[7] * x0 + diag[8] * x1 + diag[9] * x2 + diag[10] * x3 + diag[11] * x4 + diag[12] * x5 + diag[13] * x6; 224 yy[7 * i + 2] = diag[14] * x0 + diag[15] * x1 + diag[16] * x2 + diag[17] * x3 + diag[18] * x4 + diag[19] * x5 + diag[20] * x6; 225 yy[7 * i + 3] = diag[21] * x0 + diag[22] * x1 + diag[23] * x2 + diag[24] * x3 + diag[25] * x4 + diag[26] * x5 + diag[27] * x6; 226 yy[7 * i + 4] = diag[28] * x0 + diag[29] * x1 + diag[30] * x2 + diag[31] * x3 + diag[32] * x4 + diag[33] * x5 + diag[34] * x6; 227 yy[7 * i + 5] = diag[35] * x0 + diag[36] * x1 + diag[37] * x2 + diag[38] * x3 + diag[39] * x4 + diag[40] * x5 + diag[41] * x6; 228 yy[7 * i + 6] = diag[42] * x0 + diag[43] * x1 + diag[44] * x2 + diag[45] * x3 + diag[46] * x4 + diag[47] * x5 + diag[48] * x6; 229 diag += 49; 230 } 231 break; 232 default: 233 for (i = 0; i < m; i++) { 234 for (ib = 0; ib < bs; ib++) { 235 PetscScalar rowsum = 0; 236 for (jb = 0; jb < bs; jb++) rowsum += diag[ib * bs + jb] * xx[bs * i + jb]; 237 yy[bs * i + ib] = rowsum; 238 } 239 diag += bs * bs; 240 } 241 } 242 PetscCall(VecRestoreArrayRead(x, &xx)); 243 PetscCall(VecRestoreArray(y, &yy)); 244 PetscCall(PetscLogFlops((2.0 * bs * bs - bs) * m)); 245 PetscFunctionReturn(PETSC_SUCCESS); 246 } 247 248 PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_Host(PC pc, Mat diagPB) 249 { 250 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 251 Mat A = diagPB ? diagPB : pc->pmat; 252 MatFactorError err; 253 PetscInt nlocal; 254 255 PetscFunctionBegin; 256 PetscCall(MatInvertBlockDiagonal(A, &jac->diag)); 257 PetscCall(MatFactorGetError(A, &err)); 258 if (err) pc->failedreason = (PCFailedReason)err; 259 260 PetscCall(MatGetBlockSize(A, &jac->bs)); 261 PetscCall(MatGetLocalSize(A, &nlocal, NULL)); 262 jac->mbs = nlocal / jac->bs; 263 PetscFunctionReturn(PETSC_SUCCESS); 264 } 265 266 static PetscErrorCode PCSetUp_PBJacobi(PC pc) 267 { 268 PetscBool flg; 269 Mat diagPB = NULL; 270 271 PetscFunctionBegin; 272 /* In PCCreate_PBJacobi() pmat might have not been set, so we wait to the last minute to do the dispatch */ 273 274 // pmat (e.g., MatCEED from libCEED) might have its own method to provide a matrix (diagPB) 275 // made of the diagonal blocks. So we check both pmat and diagVPB. 276 PetscCall(MatHasOperation(pc->pmat, MATOP_GET_BLOCK_DIAGONAL, &flg)); 277 if (flg) PetscUseTypeMethod(pc->pmat, getblockdiagonal, &diagPB); // diagPB's reference count is increased upon return 278 279 #if defined(PETSC_HAVE_CUDA) 280 PetscBool isCuda; 281 PetscCall(PetscObjectTypeCompareAny((PetscObject)pc->pmat, &isCuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, "")); 282 if (!isCuda && diagPB) PetscCall(PetscObjectTypeCompareAny((PetscObject)diagPB, &isCuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, "")); 283 #endif 284 #if defined(PETSC_HAVE_KOKKOS_KERNELS) 285 PetscBool isKok; 286 PetscCall(PetscObjectTypeCompareAny((PetscObject)pc->pmat, &isKok, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, "")); 287 if (!isKok && diagPB) PetscCall(PetscObjectTypeCompareAny((PetscObject)diagPB, &isKok, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, "")); 288 #endif 289 290 #if defined(PETSC_HAVE_CUDA) 291 if (isCuda) PetscCall(PCSetUp_PBJacobi_CUDA(pc, diagPB)); 292 else 293 #endif 294 #if defined(PETSC_HAVE_KOKKOS_KERNELS) 295 if (isKok) 296 PetscCall(PCSetUp_PBJacobi_Kokkos(pc, diagPB)); 297 else 298 #endif 299 { 300 PetscCall(PCSetUp_PBJacobi_Host(pc, diagPB)); 301 } 302 PetscCall(MatDestroy(&diagPB)); // since we don't need it anymore, we don't stash it in PC_PBJacobi 303 PetscFunctionReturn(PETSC_SUCCESS); 304 } 305 306 PetscErrorCode PCDestroy_PBJacobi(PC pc) 307 { 308 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 309 310 PetscFunctionBegin; 311 /* 312 Free the private data structure that was hanging off the PC 313 */ 314 // PetscCall(PetscFree(jac->diag)); // the memory is owned by e.g., a->ibdiag in Mat_SeqAIJ, so don't free it here. 315 PetscCall(MatDestroy(&jac->diagPB)); 316 PetscCall(PetscFree(pc->data)); 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 static PetscErrorCode PCView_PBJacobi(PC pc, PetscViewer viewer) 321 { 322 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 323 PetscBool isascii; 324 325 PetscFunctionBegin; 326 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii)); 327 if (isascii) PetscCall(PetscViewerASCIIPrintf(viewer, " point-block size %" PetscInt_FMT "\n", jac->bs)); 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 331 /*MC 332 PCPBJACOBI - Point block Jacobi preconditioner 333 334 Notes: 335 See `PCJACOBI` for diagonal Jacobi, `PCVPBJACOBI` for variable-size point block, and `PCBJACOBI` for large size blocks 336 337 This works for `MATAIJ` and `MATBAIJ` matrices and uses the blocksize provided to the matrix 338 339 Uses dense LU factorization with partial pivoting to invert the blocks; if a zero pivot 340 is detected a PETSc error is generated. 341 342 Developer Notes: 343 This should support the `PCSetErrorIfFailure()` flag set to `PETSC_TRUE` to allow 344 the factorization to continue even after a zero pivot is found resulting in a NaN and hence 345 terminating `KSP` with a `KSP_DIVERGED_NANORINF` allowing 346 a nonlinear solver/ODE integrator to recover without stopping the program as currently happens. 347 348 Perhaps should provide an option that allows generation of a valid preconditioner 349 even if a block is singular as the `PCJACOBI` does. 350 351 Level: beginner 352 353 .seealso: [](ch_ksp), `PCCreate()`, `PCSetType()`, `PCType`, `PC`, `PCJACOBI`, `PCVPBJACOBI`, `PCBJACOBI` 354 M*/ 355 356 PETSC_EXTERN PetscErrorCode PCCreate_PBJacobi(PC pc) 357 { 358 PC_PBJacobi *jac; 359 360 PetscFunctionBegin; 361 /* 362 Creates the private data structure for this preconditioner and 363 attach it to the PC object. 364 */ 365 PetscCall(PetscNew(&jac)); 366 pc->data = (void *)jac; 367 368 /* 369 Initialize the pointers to vectors to ZERO; these will be used to store 370 diagonal entries of the matrix for fast preconditioner application. 371 */ 372 jac->diag = NULL; 373 374 /* 375 Set the pointers for the functions that are provided above. 376 Now when the user-level routines (such as PCApply(), PCDestroy(), etc.) 377 are called, they will automatically call these functions. Note we 378 choose not to provide a couple of these functions since they are 379 not needed. 380 */ 381 pc->ops->apply = PCApply_PBJacobi; 382 pc->ops->applytranspose = PCApplyTranspose_PBJacobi; 383 pc->ops->setup = PCSetUp_PBJacobi; 384 pc->ops->destroy = PCDestroy_PBJacobi; 385 pc->ops->setfromoptions = NULL; 386 pc->ops->view = PCView_PBJacobi; 387 pc->ops->applyrichardson = NULL; 388 pc->ops->applysymmetricleft = NULL; 389 pc->ops->applysymmetricright = NULL; 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392