1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 /* 8 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 9 * therefore we pack data types manually. This file defines packing routines for the standard data types. 10 */ 11 12 #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d 13 14 /* Operations working like s += t */ 15 #define OP_BINARY(op, s, t) \ 16 do { \ 17 (s) = (s)op(t); \ 18 } while (0) /* binary ops in the middle such as +, *, && etc. */ 19 #define OP_FUNCTION(op, s, t) \ 20 do { \ 21 (s) = op((s), (t)); \ 22 } while (0) /* ops like a function, such as PetscMax, PetscMin */ 23 #define OP_LXOR(op, s, t) \ 24 do { \ 25 (s) = (!(s)) != (!(t)); \ 26 } while (0) /* logical exclusive OR */ 27 #define OP_ASSIGN(op, s, t) \ 28 do { \ 29 (s) = (t); \ 30 } while (0) 31 /* Ref MPI MAXLOC */ 32 #define OP_XLOC(op, s, t) \ 33 do { \ 34 if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \ 35 else if (!((s).u op(t).u)) s = t; \ 36 } while (0) 37 38 /* DEF_PackFunc - macro defining a Pack routine 39 40 Arguments of the macro: 41 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 42 .BS Block size for vectorization. It is a factor of bsz. 43 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 44 45 Arguments of the Pack routine: 46 +count Number of indices in idx[]. 47 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 48 .opt Per-pack optimization plan. NULL means no such plan. 49 .idx Indices of entries to packed. 50 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 51 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 52 -packed Address of the packed data. 53 */ 54 #define DEF_PackFunc(Type, BS, EQ) \ 55 static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \ 56 { \ 57 const Type *u = (const Type *)unpacked, *u2; \ 58 Type *p = (Type *)packed, *p2; \ 59 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 60 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 61 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 62 PetscFunctionBegin; \ 63 if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \ 64 else if (opt) { /* has optimizations available */ p2 = p; \ 65 for (r = 0; r < opt->n; r++) { \ 66 u2 = u + opt->start[r] * MBS; \ 67 X = opt->X[r]; \ 68 Y = opt->Y[r]; \ 69 for (k = 0; k < opt->dz[r]; k++) \ 70 for (j = 0; j < opt->dy[r]; j++) { \ 71 PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \ 72 p2 += opt->dx[r] * MBS; \ 73 } \ 74 } \ 75 } else { \ 76 for (i = 0; i < count; i++) \ 77 for (j = 0; j < M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 78 for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 79 p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \ 80 } \ 81 PetscFunctionReturn(PETSC_SUCCESS); \ 82 } 83 84 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 85 and inserts into a sparse array. 86 87 Arguments: 88 .Type Type of the data 89 .BS Block size for vectorization 90 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 91 92 Notes: 93 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 94 */ 95 #define DEF_UnpackFunc(Type, BS, EQ) \ 96 static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \ 97 { \ 98 Type *u = (Type *)unpacked, *u2; \ 99 const Type *p = (const Type *)packed; \ 100 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 101 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 102 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 103 PetscFunctionBegin; \ 104 if (!idx) { \ 105 u += start * MBS; \ 106 if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \ 107 } else if (opt) { /* has optimizations available */ \ 108 for (r = 0; r < opt->n; r++) { \ 109 u2 = u + opt->start[r] * MBS; \ 110 X = opt->X[r]; \ 111 Y = opt->Y[r]; \ 112 for (k = 0; k < opt->dz[r]; k++) \ 113 for (j = 0; j < opt->dy[r]; j++) { \ 114 PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \ 115 p += opt->dx[r] * MBS; \ 116 } \ 117 } \ 118 } else { \ 119 for (i = 0; i < count; i++) \ 120 for (j = 0; j < M; j++) \ 121 for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \ 122 } \ 123 PetscFunctionReturn(PETSC_SUCCESS); \ 124 } 125 126 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 127 128 Arguments: 129 +Opname Name of the Op, such as Add, Mult, LAND, etc. 130 .Type Type of the data 131 .BS Block size for vectorization 132 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 133 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 134 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 135 */ 136 #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 137 static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \ 138 { \ 139 Type *u = (Type *)unpacked, *u2; \ 140 const Type *p = (const Type *)packed; \ 141 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 142 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 143 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 144 PetscFunctionBegin; \ 145 if (!idx) { \ 146 u += start * MBS; \ 147 for (i = 0; i < count; i++) \ 148 for (j = 0; j < M; j++) \ 149 for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 150 } else if (opt) { /* idx[] has patterns */ \ 151 for (r = 0; r < opt->n; r++) { \ 152 u2 = u + opt->start[r] * MBS; \ 153 X = opt->X[r]; \ 154 Y = opt->Y[r]; \ 155 for (k = 0; k < opt->dz[r]; k++) \ 156 for (j = 0; j < opt->dy[r]; j++) { \ 157 for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \ 158 p += opt->dx[r] * MBS; \ 159 } \ 160 } \ 161 } else { \ 162 for (i = 0; i < count; i++) \ 163 for (j = 0; j < M; j++) \ 164 for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 165 } \ 166 PetscFunctionReturn(PETSC_SUCCESS); \ 167 } 168 169 #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 170 static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \ 171 { \ 172 Type *u = (Type *)unpacked, *p = (Type *)packed, tmp; \ 173 PetscInt i, j, k, r, l, bs = link->bs; \ 174 const PetscInt M = (EQ) ? 1 : bs / BS; \ 175 const PetscInt MBS = M * BS; \ 176 PetscFunctionBegin; \ 177 for (i = 0; i < count; i++) { \ 178 r = (!idx ? start + i : idx[i]) * MBS; \ 179 l = i * MBS; \ 180 for (j = 0; j < M; j++) \ 181 for (k = 0; k < BS; k++) { \ 182 tmp = u[r + j * BS + k]; \ 183 OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \ 184 p[l + j * BS + k] = tmp; \ 185 } \ 186 } \ 187 PetscFunctionReturn(PETSC_SUCCESS); \ 188 } 189 190 #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 191 static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \ 192 { \ 193 const Type *u = (const Type *)src; \ 194 Type *v = (Type *)dst; \ 195 PetscInt i, j, k, s, t, X, Y, bs = link->bs; \ 196 const PetscInt M = (EQ) ? 1 : bs / BS; \ 197 const PetscInt MBS = M * BS; \ 198 PetscFunctionBegin; \ 199 if (!srcIdx) { /* src is contiguous */ \ 200 u += srcStart * MBS; \ 201 PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \ 202 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 203 u += srcOpt->start[0] * MBS; \ 204 v += dstStart * MBS; \ 205 X = srcOpt->X[0]; \ 206 Y = srcOpt->Y[0]; \ 207 for (k = 0; k < srcOpt->dz[0]; k++) \ 208 for (j = 0; j < srcOpt->dy[0]; j++) { \ 209 for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \ 210 v += srcOpt->dx[0] * MBS; \ 211 } \ 212 } else { /* all other cases */ \ 213 for (i = 0; i < count; i++) { \ 214 s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \ 215 t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \ 216 for (j = 0; j < M; j++) \ 217 for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \ 218 } \ 219 } \ 220 PetscFunctionReturn(PETSC_SUCCESS); \ 221 } 222 223 #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \ 224 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \ 225 { \ 226 Type *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \ 227 const Type *ldata = (const Type *)leafdata; \ 228 PetscInt i, j, k, r, l, bs = link->bs; \ 229 const PetscInt M = (EQ) ? 1 : bs / BS; \ 230 const PetscInt MBS = M * BS; \ 231 PetscFunctionBegin; \ 232 for (i = 0; i < count; i++) { \ 233 r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \ 234 l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \ 235 for (j = 0; j < M; j++) \ 236 for (k = 0; k < BS; k++) { \ 237 lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \ 238 OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \ 239 } \ 240 } \ 241 PetscFunctionReturn(PETSC_SUCCESS); \ 242 } 243 244 /* Pack, Unpack/Fetch ops */ 245 #define DEF_Pack(Type, BS, EQ) \ 246 DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \ 247 { \ 248 link->h_Pack = CPPJoin4(Pack, Type, BS, EQ); \ 249 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \ 250 link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \ 251 } 252 253 /* Add, Mult ops */ 254 #define DEF_Add(Type, BS, EQ) \ 255 DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \ 256 { \ 257 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \ 258 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type, BS, EQ); \ 259 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type, BS, EQ); \ 260 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \ 261 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type, BS, EQ); \ 262 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \ 263 } 264 265 /* Max, Min ops */ 266 #define DEF_Cmp(Type, BS, EQ) \ 267 DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \ 268 { \ 269 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 270 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 271 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 272 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 273 } 274 275 /* Logical ops. 276 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 277 the compilation warning "empty macro arguments are undefined in ISO C90" 278 */ 279 #define DEF_Log(Type, BS, EQ) \ 280 DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \ 281 { \ 282 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \ 283 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \ 284 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \ 285 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \ 286 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \ 287 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \ 288 } 289 290 /* Bitwise ops */ 291 #define DEF_Bit(Type, BS, EQ) \ 292 DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \ 293 { \ 294 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \ 295 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \ 296 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \ 297 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \ 298 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \ 299 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \ 300 } 301 302 /* Maxloc, Minloc ops */ 303 #define DEF_Xloc(Type, BS, EQ) \ 304 DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \ 305 { \ 306 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 307 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 308 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 309 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 310 } 311 312 #define DEF_IntegerType(Type, BS, EQ) \ 313 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \ 314 { \ 315 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 316 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 317 CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 318 CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \ 319 CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \ 320 } 321 322 #define DEF_RealType(Type, BS, EQ) \ 323 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \ 324 { \ 325 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 326 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 327 CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 328 } 329 330 #if defined(PETSC_HAVE_COMPLEX) 331 #define DEF_ComplexType(Type, BS, EQ) \ 332 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \ 333 { \ 334 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 335 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 336 } 337 #endif 338 339 #define DEF_DumbType(Type, BS, EQ) \ 340 DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \ 341 { \ 342 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 343 } 344 345 /* Maxloc, Minloc */ 346 #define DEF_PairType(Type, BS, EQ) \ 347 DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \ 348 { \ 349 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 350 CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \ 351 } 352 353 DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT */ 354 DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */ 355 DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */ 356 DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */ 357 DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */ 358 DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */ 359 DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */ 360 DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 361 362 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 363 DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0) 364 #endif 365 366 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 367 typedef signed char SignedChar; 368 DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0) 369 370 typedef unsigned char UnsignedChar; 371 DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0) 372 373 DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0) 374 #if defined(PETSC_HAVE_COMPLEX) 375 DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0) 376 #endif 377 378 #define PairType(Type1, Type2) Type1##_##Type2 379 typedef struct { 380 int u; 381 int i; 382 } PairType(int, int); 383 typedef struct { 384 PetscInt u; 385 PetscInt i; 386 } PairType(PetscInt, PetscInt); 387 DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1) 388 389 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 390 DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0) 391 392 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 393 DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0) 394 395 PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link) 396 { 397 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 398 PetscInt i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8; 399 400 PetscFunctionBegin; 401 /* Destroy device-specific fields */ 402 if (link->deviceinited) PetscCall((*link->Destroy)(sf, link)); 403 404 /* Destroy host related fields */ 405 if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); 406 if (!link->use_nvshmem) { 407 for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */ 408 if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i])); 409 } 410 PetscCall(PetscFree(link->reqs)); 411 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 412 PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST])); 413 PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST])); 414 } 415 } 416 PetscCall(PetscFree(link)); 417 PetscFunctionReturn(PETSC_SUCCESS); 418 } 419 420 PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) 421 { 422 PetscFunctionBegin; 423 PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata)); 424 #if defined(PETSC_HAVE_NVSHMEM) 425 { 426 PetscBool use_nvshmem; 427 PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem)); 428 if (use_nvshmem) { 429 PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 } 433 #endif 434 PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 435 PetscFunctionReturn(PETSC_SUCCESS); 436 } 437 438 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 439 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 440 */ 441 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs) 442 { 443 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 444 PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks; 445 const PetscInt *rootoffset, *leafoffset; 446 MPI_Aint disp; 447 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 448 MPI_Datatype unit = link->unit; 449 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 450 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 451 452 PetscFunctionBegin; 453 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 454 if (sf->persistent) { 455 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 456 PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL)); 457 if (direction == PETSCSF_LEAF2ROOT) { 458 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 459 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 460 cnt = rootoffset[i + 1] - rootoffset[i]; 461 PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 462 } 463 } else { /* PETSCSF_ROOT2LEAF */ 464 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 465 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 466 cnt = rootoffset[i + 1] - rootoffset[i]; 467 PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 468 } 469 } 470 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 471 } 472 473 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 474 PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL)); 475 if (direction == PETSCSF_LEAF2ROOT) { 476 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 477 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 478 cnt = leafoffset[i + 1] - leafoffset[i]; 479 PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 480 } 481 } else { /* PETSCSF_ROOT2LEAF */ 482 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 483 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 484 cnt = leafoffset[i + 1] - leafoffset[i]; 485 PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 486 } 487 } 488 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 489 } 490 } 491 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 492 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 493 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 494 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 495 PetscFunctionReturn(PETSC_SUCCESS); 496 } 497 498 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink) 499 { 500 PetscSFLink link, *p; 501 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 502 503 PetscFunctionBegin; 504 /* Look for types in cache */ 505 for (p = &bas->inuse; (link = *p); p = &link->next) { 506 PetscBool match; 507 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 508 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 509 switch (cmode) { 510 case PETSC_OWN_POINTER: 511 *p = link->next; 512 break; /* Remove from inuse list */ 513 case PETSC_USE_POINTER: 514 break; 515 default: 516 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode"); 517 } 518 *mylink = link; 519 PetscFunctionReturn(PETSC_SUCCESS); 520 } 521 } 522 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack"); 523 } 524 525 PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink) 526 { 527 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 528 PetscSFLink link = *mylink; 529 530 PetscFunctionBegin; 531 link->rootdata = NULL; 532 link->leafdata = NULL; 533 link->next = bas->avail; 534 bas->avail = link; 535 *mylink = NULL; 536 PetscFunctionReturn(PETSC_SUCCESS); 537 } 538 539 /* Error out on unsupported overlapped communications */ 540 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata) 541 { 542 PetscSFLink link, *p; 543 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 544 PetscBool match; 545 546 PetscFunctionBegin; 547 if (PetscDefined(USE_DEBUG)) { 548 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 549 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 550 */ 551 if (rootdata || leafdata) { 552 for (p = &bas->inuse; (link = *p); p = &link->next) { 553 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 554 PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata); 555 } 556 } 557 } 558 PetscFunctionReturn(PETSC_SUCCESS); 559 } 560 561 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) 562 { 563 PetscFunctionBegin; 564 if (n) PetscCall(PetscMemcpy(dst, src, n)); 565 PetscFunctionReturn(PETSC_SUCCESS); 566 } 567 568 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit) 569 { 570 PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0; 571 PetscBool is2Int, is2PetscInt; 572 PetscMPIInt ni, na, nd, combiner; 573 #if defined(PETSC_HAVE_COMPLEX) 574 PetscInt nPetscComplex = 0; 575 #endif 576 577 PetscFunctionBegin; 578 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar)); 579 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar)); 580 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 581 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt)); 582 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt)); 583 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal)); 584 #if defined(PETSC_HAVE_COMPLEX) 585 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex)); 586 #endif 587 PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int)); 588 PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt)); 589 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 590 PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner)); 591 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 592 link->bs = 1; /* default */ 593 594 if (is2Int) { 595 PackInit_PairType_int_int_1_1(link); 596 link->bs = 1; 597 link->unitbytes = 2 * sizeof(int); 598 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 599 link->basicunit = MPI_2INT; 600 link->unit = MPI_2INT; 601 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 602 PackInit_PairType_PetscInt_PetscInt_1_1(link); 603 link->bs = 1; 604 link->unitbytes = 2 * sizeof(PetscInt); 605 link->basicunit = MPIU_2INT; 606 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 607 link->unit = MPIU_2INT; 608 } else if (nPetscReal) { 609 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); 610 else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link); 611 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); 612 else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link); 613 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); 614 else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link); 615 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); 616 else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link); 617 link->bs = nPetscReal; 618 link->unitbytes = nPetscReal * sizeof(PetscReal); 619 link->basicunit = MPIU_REAL; 620 if (link->bs == 1) { 621 link->isbuiltin = PETSC_TRUE; 622 link->unit = MPIU_REAL; 623 } 624 } else if (nPetscInt) { 625 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); 626 else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 627 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); 628 else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 629 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); 630 else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 631 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); 632 else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 633 link->bs = nPetscInt; 634 link->unitbytes = nPetscInt * sizeof(PetscInt); 635 link->basicunit = MPIU_INT; 636 if (link->bs == 1) { 637 link->isbuiltin = PETSC_TRUE; 638 link->unit = MPIU_INT; 639 } 640 #if defined(PETSC_USE_64BIT_INDICES) 641 } else if (nInt) { 642 if (nInt == 8) PackInit_IntegerType_int_8_1(link); 643 else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link); 644 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); 645 else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link); 646 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); 647 else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link); 648 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); 649 else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link); 650 link->bs = nInt; 651 link->unitbytes = nInt * sizeof(int); 652 link->basicunit = MPI_INT; 653 if (link->bs == 1) { 654 link->isbuiltin = PETSC_TRUE; 655 link->unit = MPI_INT; 656 } 657 #endif 658 } else if (nSignedChar) { 659 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); 660 else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 661 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); 662 else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 663 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); 664 else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 665 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); 666 else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 667 link->bs = nSignedChar; 668 link->unitbytes = nSignedChar * sizeof(SignedChar); 669 link->basicunit = MPI_SIGNED_CHAR; 670 if (link->bs == 1) { 671 link->isbuiltin = PETSC_TRUE; 672 link->unit = MPI_SIGNED_CHAR; 673 } 674 } else if (nUnsignedChar) { 675 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); 676 else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 677 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); 678 else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 679 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); 680 else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 681 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); 682 else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 683 link->bs = nUnsignedChar; 684 link->unitbytes = nUnsignedChar * sizeof(UnsignedChar); 685 link->basicunit = MPI_UNSIGNED_CHAR; 686 if (link->bs == 1) { 687 link->isbuiltin = PETSC_TRUE; 688 link->unit = MPI_UNSIGNED_CHAR; 689 } 690 #if defined(PETSC_HAVE_COMPLEX) 691 } else if (nPetscComplex) { 692 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); 693 else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 694 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); 695 else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 696 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); 697 else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 698 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); 699 else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 700 link->bs = nPetscComplex; 701 link->unitbytes = nPetscComplex * sizeof(PetscComplex); 702 link->basicunit = MPIU_COMPLEX; 703 if (link->bs == 1) { 704 link->isbuiltin = PETSC_TRUE; 705 link->unit = MPIU_COMPLEX; 706 } 707 #endif 708 } else { 709 MPI_Aint lb, nbyte; 710 PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte)); 711 PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb); 712 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 713 if (nbyte == 4) PackInit_DumbType_char_4_1(link); 714 else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link); 715 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); 716 else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link); 717 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); 718 else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link); 719 link->bs = nbyte; 720 link->unitbytes = nbyte; 721 link->basicunit = MPI_BYTE; 722 } else { 723 nInt = nbyte / sizeof(int); 724 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); 725 else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link); 726 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); 727 else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link); 728 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); 729 else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link); 730 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); 731 else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link); 732 link->bs = nInt; 733 link->unitbytes = nbyte; 734 link->basicunit = MPI_INT; 735 } 736 if (link->isbuiltin) link->unit = unit; 737 } 738 739 if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit)); 740 741 link->Memcpy = PetscSFLinkMemcpy_Host; 742 PetscFunctionReturn(PETSC_SUCCESS); 743 } 744 745 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)) 746 { 747 PetscFunctionBegin; 748 *UnpackAndOp = NULL; 749 if (PetscMemTypeHost(mtype)) { 750 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 751 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 752 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 753 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 754 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 755 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 756 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 757 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 758 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 759 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 760 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 761 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 762 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 763 } 764 #if defined(PETSC_HAVE_DEVICE) 765 else if (PetscMemTypeDevice(mtype) && !atomic) { 766 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 767 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 768 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 769 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 770 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 771 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 772 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 773 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 774 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 775 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 776 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 777 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 778 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 779 } else if (PetscMemTypeDevice(mtype) && atomic) { 780 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 781 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 782 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 783 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 784 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 785 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 786 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 787 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 788 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 789 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 790 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 791 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 792 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 793 } 794 #endif 795 PetscFunctionReturn(PETSC_SUCCESS); 796 } 797 798 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) 799 { 800 PetscFunctionBegin; 801 *ScatterAndOp = NULL; 802 if (PetscMemTypeHost(mtype)) { 803 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 804 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 805 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 806 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 807 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 808 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 809 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 810 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 811 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 812 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 813 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 814 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 815 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 816 } 817 #if defined(PETSC_HAVE_DEVICE) 818 else if (PetscMemTypeDevice(mtype) && !atomic) { 819 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 820 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 821 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 822 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 823 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 824 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 825 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 826 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 827 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 828 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 829 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 830 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 831 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 832 } else if (PetscMemTypeDevice(mtype) && atomic) { 833 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 834 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 835 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 836 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 837 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 838 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 839 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 840 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 841 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 842 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 843 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 844 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 845 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 846 } 847 #endif 848 PetscFunctionReturn(PETSC_SUCCESS); 849 } 850 851 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) 852 { 853 PetscFunctionBegin; 854 *FetchAndOp = NULL; 855 PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 856 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 857 #if defined(PETSC_HAVE_DEVICE) 858 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 859 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 860 #endif 861 PetscFunctionReturn(PETSC_SUCCESS); 862 } 863 864 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) 865 { 866 PetscFunctionBegin; 867 *FetchAndOpLocal = NULL; 868 PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 869 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 870 #if defined(PETSC_HAVE_DEVICE) 871 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 872 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 873 #endif 874 PetscFunctionReturn(PETSC_SUCCESS); 875 } 876 877 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) 878 { 879 PetscLogDouble flops; 880 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 881 882 PetscFunctionBegin; 883 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 884 flops = bas->rootbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */ 885 #if defined(PETSC_HAVE_DEVICE) 886 if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); 887 else 888 #endif 889 PetscCall(PetscLogFlops(flops)); 890 } 891 PetscFunctionReturn(PETSC_SUCCESS); 892 } 893 894 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) 895 { 896 PetscLogDouble flops; 897 898 PetscFunctionBegin; 899 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 900 flops = sf->leafbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */ 901 #if defined(PETSC_HAVE_DEVICE) 902 if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); 903 else 904 #endif 905 PetscCall(PetscLogFlops(flops)); 906 } 907 PetscFunctionReturn(PETSC_SUCCESS); 908 } 909 910 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 911 Input Parameters: 912 +sf - The StarForest 913 .link - The link 914 .count - Number of entries to unpack 915 .start - The first index, significant when indices=NULL 916 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 917 .buf - A contiguous buffer to unpack from 918 -op - Operation after unpack 919 920 Output Parameters: 921 .data - The data to unpack to 922 */ 923 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) 924 { 925 PetscFunctionBegin; 926 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 927 { 928 PetscInt i; 929 if (indices) { 930 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 931 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 932 */ 933 for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op)); 934 } else { 935 PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op)); 936 } 937 } 938 PetscFunctionReturn(PETSC_SUCCESS); 939 #else 940 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 941 #endif 942 } 943 944 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) 945 { 946 PetscFunctionBegin; 947 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 948 { 949 PetscInt i, disp; 950 if (!srcIdx) { 951 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op)); 952 } else { 953 for (i = 0; i < count; i++) { 954 disp = dstIdx ? dstIdx[i] : dstStart + i; 955 PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op)); 956 } 957 } 958 } 959 PetscFunctionReturn(PETSC_SUCCESS); 960 #else 961 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 962 #endif 963 } 964 965 /*============================================================================= 966 Pack/Unpack/Fetch/Scatter routines 967 ============================================================================*/ 968 969 /* Pack rootdata to rootbuf 970 Input Parameters: 971 + sf - The SF this packing works on. 972 . link - It gives the memtype of the roots and also provides root buffer. 973 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 974 - rootdata - Where to read the roots. 975 976 Notes: 977 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 978 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 979 */ 980 static PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) 981 { 982 const PetscInt *rootindices = NULL; 983 PetscInt count, start; 984 PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 985 PetscMemType rootmtype = link->rootmtype; 986 PetscSFPackOpt opt = NULL; 987 988 PetscFunctionBegin; 989 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 990 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 991 PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack)); 992 PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 993 } 994 PetscFunctionReturn(PETSC_SUCCESS); 995 } 996 997 /* Pack leafdata to leafbuf */ 998 static PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) 999 { 1000 const PetscInt *leafindices = NULL; 1001 PetscInt count, start; 1002 PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 1003 PetscMemType leafmtype = link->leafmtype; 1004 PetscSFPackOpt opt = NULL; 1005 1006 PetscFunctionBegin; 1007 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1008 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 1009 PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack)); 1010 PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 1011 } 1012 PetscFunctionReturn(PETSC_SUCCESS); 1013 } 1014 1015 /* Pack rootdata to rootbuf, which are in the same memory space */ 1016 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) 1017 { 1018 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1019 1020 PetscFunctionBegin; 1021 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1022 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1023 if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1024 } 1025 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 1026 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata)); 1027 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 1028 PetscFunctionReturn(PETSC_SUCCESS); 1029 } 1030 /* Pack leafdata to leafbuf, which are in the same memory space */ 1031 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) 1032 { 1033 PetscFunctionBegin; 1034 if (scope == PETSCSF_REMOTE) { 1035 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1036 if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1037 } 1038 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 1039 if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata)); 1040 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 1041 PetscFunctionReturn(PETSC_SUCCESS); 1042 } 1043 1044 static PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) 1045 { 1046 const PetscInt *rootindices = NULL; 1047 PetscInt count, start; 1048 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1049 PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1050 PetscMemType rootmtype = link->rootmtype; 1051 PetscSFPackOpt opt = NULL; 1052 1053 PetscFunctionBegin; 1054 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1055 PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp)); 1056 if (UnpackAndOp) { 1057 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 1058 PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 1059 } else { 1060 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices)); 1061 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op)); 1062 } 1063 } 1064 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op)); 1065 PetscFunctionReturn(PETSC_SUCCESS); 1066 } 1067 1068 static PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) 1069 { 1070 const PetscInt *leafindices = NULL; 1071 PetscInt count, start; 1072 PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1073 PetscMemType leafmtype = link->leafmtype; 1074 PetscSFPackOpt opt = NULL; 1075 1076 PetscFunctionBegin; 1077 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1078 PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp)); 1079 if (UnpackAndOp) { 1080 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 1081 PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 1082 } else { 1083 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices)); 1084 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op)); 1085 } 1086 } 1087 PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op)); 1088 PetscFunctionReturn(PETSC_SUCCESS); 1089 } 1090 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1091 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) 1092 { 1093 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1094 1095 PetscFunctionBegin; 1096 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1097 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op)); 1098 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1099 if (scope == PETSCSF_REMOTE) { 1100 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1101 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1102 } 1103 PetscFunctionReturn(PETSC_SUCCESS); 1104 } 1105 1106 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1107 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) 1108 { 1109 PetscFunctionBegin; 1110 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1111 if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op)); 1112 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1113 if (scope == PETSCSF_REMOTE) { 1114 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1115 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1116 } 1117 PetscFunctionReturn(PETSC_SUCCESS); 1118 } 1119 1120 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1121 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) 1122 { 1123 const PetscInt *rootindices = NULL; 1124 PetscInt count, start; 1125 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1126 PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL; 1127 PetscMemType rootmtype = link->rootmtype; 1128 PetscSFPackOpt opt = NULL; 1129 1130 PetscFunctionBegin; 1131 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1132 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1133 /* Do FetchAndOp on rootdata with rootbuf */ 1134 PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp)); 1135 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices)); 1136 PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype])); 1137 } 1138 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op)); 1139 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1140 PetscFunctionReturn(PETSC_SUCCESS); 1141 } 1142 1143 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) 1144 { 1145 const PetscInt *rootindices = NULL, *leafindices = NULL; 1146 PetscInt count, rootstart, leafstart; 1147 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1148 PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL; 1149 PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype; 1150 PetscSFPackOpt leafopt = NULL, rootopt = NULL; 1151 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1152 char *srcbuf = NULL, *dstbuf = NULL; 1153 PetscBool dstdups; 1154 1155 PetscFunctionBegin; 1156 if (!buflen) PetscFunctionReturn(PETSC_SUCCESS); 1157 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1158 if (direction == PETSCSF_ROOT2LEAF) { 1159 PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata)); 1160 srcmtype = rootmtype; 1161 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1162 dstmtype = leafmtype; 1163 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1164 } else { 1165 PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata)); 1166 srcmtype = leafmtype; 1167 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1168 dstmtype = rootmtype; 1169 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1170 } 1171 PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes)); 1172 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1173 if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link)); 1174 if (direction == PETSCSF_ROOT2LEAF) { 1175 PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op)); 1176 } else { 1177 PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op)); 1178 } 1179 } else { 1180 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1181 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1182 PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp)); 1183 if (ScatterAndOp) { 1184 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 1185 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 1186 if (direction == PETSCSF_ROOT2LEAF) { 1187 PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata)); 1188 } else { 1189 PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata)); 1190 } 1191 } else { 1192 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 1193 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 1194 if (direction == PETSCSF_ROOT2LEAF) { 1195 PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op)); 1196 } else { 1197 PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op)); 1198 } 1199 } 1200 } 1201 PetscFunctionReturn(PETSC_SUCCESS); 1202 } 1203 1204 /* Fetch rootdata to leafdata and leafupdate locally */ 1205 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) 1206 { 1207 const PetscInt *rootindices = NULL, *leafindices = NULL; 1208 PetscInt count, rootstart, leafstart; 1209 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1210 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 1211 const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype; 1212 PetscSFPackOpt leafopt = NULL, rootopt = NULL; 1213 1214 PetscFunctionBegin; 1215 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS); 1216 if (rootmtype != leafmtype) { 1217 /* The local communication has to go through pack and unpack */ 1218 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1219 } else { 1220 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 1221 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 1222 PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal)); 1223 PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate)); 1224 } 1225 PetscFunctionReturn(PETSC_SUCCESS); 1226 } 1227 1228 /* 1229 Create per-rank pack/unpack optimizations based on indices patterns 1230 1231 Input Parameters: 1232 + n - Number of destination ranks 1233 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1234 - idx - [*] Array storing indices 1235 1236 Output Parameters: 1237 + opt - Pack optimizations. NULL if no optimizations. 1238 */ 1239 static PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out) 1240 { 1241 PetscInt r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y; 1242 PetscBool optimizable = PETSC_TRUE; 1243 PetscSFPackOpt opt; 1244 1245 PetscFunctionBegin; 1246 PetscCall(PetscMalloc1(1, &opt)); 1247 PetscCall(PetscMalloc1(7 * n + 2, &opt->array)); 1248 opt->n = opt->array[0] = n; 1249 opt->offset = opt->array + 1; 1250 opt->start = opt->array + n + 2; 1251 opt->dx = opt->array + 2 * n + 2; 1252 opt->dy = opt->array + 3 * n + 2; 1253 opt->dz = opt->array + 4 * n + 2; 1254 opt->X = opt->array + 5 * n + 2; 1255 opt->Y = opt->array + 6 * n + 2; 1256 1257 for (r = 0; r < n; r++) { /* For each destination rank */ 1258 m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1259 p = offset[r]; 1260 start = idx[p]; /* First index for this rank */ 1261 p++; 1262 1263 /* Search in X dimension */ 1264 for (dx = 1; dx < m; dx++, p++) { 1265 if (start + dx != idx[p]) break; 1266 } 1267 1268 dydz = m / dx; 1269 X = dydz > 1 ? (idx[p] - start) : dx; 1270 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1271 if (m % dx || X <= 0) { 1272 optimizable = PETSC_FALSE; 1273 goto finish; 1274 } 1275 for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */ 1276 for (i = 0; i < dx; i++, p++) { 1277 if (start + X * dy + i != idx[p]) { 1278 if (i) { 1279 optimizable = PETSC_FALSE; 1280 goto finish; 1281 } /* The pattern is violated in the middle of an x-walk */ 1282 else 1283 goto Z_dimension; 1284 } 1285 } 1286 } 1287 1288 Z_dimension: 1289 dz = m / (dx * dy); 1290 Y = dz > 1 ? (idx[p] - start) / X : dy; 1291 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1292 if (m % (dx * dy) || Y <= 0) { 1293 optimizable = PETSC_FALSE; 1294 goto finish; 1295 } 1296 for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1297 for (j = 0; j < dy; j++) { 1298 for (i = 0; i < dx; i++, p++) { 1299 if (start + X * Y * k + X * j + i != idx[p]) { 1300 optimizable = PETSC_FALSE; 1301 goto finish; 1302 } 1303 } 1304 } 1305 } 1306 opt->start[r] = start; 1307 opt->dx[r] = dx; 1308 opt->dy[r] = dy; 1309 opt->dz[r] = dz; 1310 opt->X[r] = X; 1311 opt->Y[r] = Y; 1312 } 1313 1314 finish: 1315 /* If not optimizable, free arrays to save memory */ 1316 if (!n || !optimizable) { 1317 PetscCall(PetscFree(opt->array)); 1318 PetscCall(PetscFree(opt)); 1319 *out = NULL; 1320 } else { 1321 opt->offset[0] = 0; 1322 for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r]; 1323 *out = opt; 1324 } 1325 PetscFunctionReturn(PETSC_SUCCESS); 1326 } 1327 1328 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out) 1329 { 1330 PetscSFPackOpt opt = *out; 1331 1332 PetscFunctionBegin; 1333 if (opt) { 1334 PetscCall(PetscSFFree(sf, mtype, opt->array)); 1335 PetscCall(PetscFree(opt)); 1336 *out = NULL; 1337 } 1338 PetscFunctionReturn(PETSC_SUCCESS); 1339 } 1340 1341 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1342 { 1343 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1344 PetscInt i, j; 1345 1346 PetscFunctionBegin; 1347 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1348 for (i = 0; i < 2; i++) { /* Set defaults */ 1349 sf->leafstart[i] = 0; 1350 sf->leafcontig[i] = PETSC_TRUE; 1351 sf->leafdups[i] = PETSC_FALSE; 1352 bas->rootstart[i] = 0; 1353 bas->rootcontig[i] = PETSC_TRUE; 1354 bas->rootdups[i] = PETSC_FALSE; 1355 } 1356 1357 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1358 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1359 1360 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1361 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1362 1363 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1364 for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */ 1365 if (sf->rmine[i] != sf->leafstart[0] + i) { 1366 sf->leafcontig[0] = PETSC_FALSE; 1367 break; 1368 } 1369 } 1370 for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */ 1371 if (sf->rmine[i] != sf->leafstart[1] + j) { 1372 sf->leafcontig[1] = PETSC_FALSE; 1373 break; 1374 } 1375 } 1376 1377 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1378 if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0])); 1379 if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1])); 1380 1381 /* Are root indices for self and remote contiguous? */ 1382 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1383 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1384 1385 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1386 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1387 1388 for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) { 1389 if (bas->irootloc[i] != bas->rootstart[0] + i) { 1390 bas->rootcontig[0] = PETSC_FALSE; 1391 break; 1392 } 1393 } 1394 for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) { 1395 if (bas->irootloc[i] != bas->rootstart[1] + j) { 1396 bas->rootcontig[1] = PETSC_FALSE; 1397 break; 1398 } 1399 } 1400 1401 if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0])); 1402 if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1])); 1403 1404 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1405 if (PetscDefined(HAVE_DEVICE)) { 1406 PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1407 if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0])); 1408 if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1])); 1409 if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0])); 1410 if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1])); 1411 } 1412 PetscFunctionReturn(PETSC_SUCCESS); 1413 } 1414 1415 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1416 { 1417 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1418 PetscInt i; 1419 1420 PetscFunctionBegin; 1421 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 1422 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i])); 1423 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i])); 1424 #if defined(PETSC_HAVE_DEVICE) 1425 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i])); 1426 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i])); 1427 #endif 1428 } 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431