1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 /* 8 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 9 * therefore we pack data types manually. This file defines packing routines for the standard data types. 10 */ 11 12 #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d 13 14 /* Operations working like s += t */ 15 #define OP_BINARY(op, s, t) \ 16 do { (s) = (s)op(t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 17 #define OP_FUNCTION(op, s, t) \ 18 do { (s) = op((s), (t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 19 #define OP_LXOR(op, s, t) \ 20 do { (s) = (!(s)) != (!(t)); } while (0) /* logical exclusive OR */ 21 #define OP_ASSIGN(op, s, t) \ 22 do { (s) = (t); } while (0) 23 /* Ref MPI MAXLOC */ 24 #define OP_XLOC(op, s, t) \ 25 do { \ 26 if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \ 27 else if (!((s).u op(t).u)) s = t; \ 28 } while (0) 29 30 /* DEF_PackFunc - macro defining a Pack routine 31 32 Arguments of the macro: 33 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 34 .BS Block size for vectorization. It is a factor of bsz. 35 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 36 37 Arguments of the Pack routine: 38 +count Number of indices in idx[]. 39 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 40 .opt Per-pack optimization plan. NULL means no such plan. 41 .idx Indices of entries to packed. 42 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 
43 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 44 -packed Address of the packed data. 45 */ 46 #define DEF_PackFunc(Type, BS, EQ) \ 47 static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) { \ 48 const Type *u = (const Type *)unpacked, *u2; \ 49 Type *p = (Type *)packed, *p2; \ 50 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 51 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 52 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 53 PetscFunctionBegin; \ 54 if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \ 55 else if (opt) { /* has optimizations available */ p2 = p; \ 56 for (r = 0; r < opt->n; r++) { \ 57 u2 = u + opt->start[r] * MBS; \ 58 X = opt->X[r]; \ 59 Y = opt->Y[r]; \ 60 for (k = 0; k < opt->dz[r]; k++) \ 61 for (j = 0; j < opt->dy[r]; j++) { \ 62 PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \ 63 p2 += opt->dx[r] * MBS; \ 64 } \ 65 } \ 66 } else { \ 67 for (i = 0; i < count; i++) \ 68 for (j = 0; j < M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 69 for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 70 p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \ 71 } \ 72 PetscFunctionReturn(0); \ 73 } 74 75 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 76 and inserts into a sparse array. 77 78 Arguments: 79 .Type Type of the data 80 .BS Block size for vectorization 81 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 82 83 Notes: 84 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 
85 */ 86 #define DEF_UnpackFunc(Type, BS, EQ) \ 87 static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \ 88 Type *u = (Type *)unpacked, *u2; \ 89 const Type *p = (const Type *)packed; \ 90 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 91 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 92 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 93 PetscFunctionBegin; \ 94 if (!idx) { \ 95 u += start * MBS; \ 96 if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \ 97 } else if (opt) { /* has optimizations available */ \ 98 for (r = 0; r < opt->n; r++) { \ 99 u2 = u + opt->start[r] * MBS; \ 100 X = opt->X[r]; \ 101 Y = opt->Y[r]; \ 102 for (k = 0; k < opt->dz[r]; k++) \ 103 for (j = 0; j < opt->dy[r]; j++) { \ 104 PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \ 105 p += opt->dx[r] * MBS; \ 106 } \ 107 } \ 108 } else { \ 109 for (i = 0; i < count; i++) \ 110 for (j = 0; j < M; j++) \ 111 for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \ 112 } \ 113 PetscFunctionReturn(0); \ 114 } 115 116 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 117 118 Arguments: 119 +Opname Name of the Op, such as Add, Mult, LAND, etc. 120 .Type Type of the data 121 .BS Block size for vectorization 122 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 123 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 124 .OpApply Macro defining application of the op. 
Could be OP_BINARY, OP_FUNCTION, OP_LXOR 125 */ 126 #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 127 static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \ 128 Type *u = (Type *)unpacked, *u2; \ 129 const Type *p = (const Type *)packed; \ 130 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 131 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 132 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 133 PetscFunctionBegin; \ 134 if (!idx) { \ 135 u += start * MBS; \ 136 for (i = 0; i < count; i++) \ 137 for (j = 0; j < M; j++) \ 138 for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 139 } else if (opt) { /* idx[] has patterns */ \ 140 for (r = 0; r < opt->n; r++) { \ 141 u2 = u + opt->start[r] * MBS; \ 142 X = opt->X[r]; \ 143 Y = opt->Y[r]; \ 144 for (k = 0; k < opt->dz[r]; k++) \ 145 for (j = 0; j < opt->dy[r]; j++) { \ 146 for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \ 147 p += opt->dx[r] * MBS; \ 148 } \ 149 } \ 150 } else { \ 151 for (i = 0; i < count; i++) \ 152 for (j = 0; j < M; j++) \ 153 for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 154 } \ 155 PetscFunctionReturn(0); \ 156 } 157 158 #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 159 static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) { \ 160 Type *u = (Type *)unpacked, *p = (Type *)packed, tmp; \ 161 PetscInt i, j, k, r, l, bs = link->bs; \ 162 const PetscInt M = (EQ) ? 
1 : bs / BS; \ 163 const PetscInt MBS = M * BS; \ 164 PetscFunctionBegin; \ 165 for (i = 0; i < count; i++) { \ 166 r = (!idx ? start + i : idx[i]) * MBS; \ 167 l = i * MBS; \ 168 for (j = 0; j < M; j++) \ 169 for (k = 0; k < BS; k++) { \ 170 tmp = u[r + j * BS + k]; \ 171 OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \ 172 p[l + j * BS + k] = tmp; \ 173 } \ 174 } \ 175 PetscFunctionReturn(0); \ 176 } 177 178 #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 179 static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) { \ 180 const Type *u = (const Type *)src; \ 181 Type *v = (Type *)dst; \ 182 PetscInt i, j, k, s, t, X, Y, bs = link->bs; \ 183 const PetscInt M = (EQ) ? 1 : bs / BS; \ 184 const PetscInt MBS = M * BS; \ 185 PetscFunctionBegin; \ 186 if (!srcIdx) { /* src is contiguous */ \ 187 u += srcStart * MBS; \ 188 PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \ 189 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 190 u += srcOpt->start[0] * MBS; \ 191 v += dstStart * MBS; \ 192 X = srcOpt->X[0]; \ 193 Y = srcOpt->Y[0]; \ 194 for (k = 0; k < srcOpt->dz[0]; k++) \ 195 for (j = 0; j < srcOpt->dy[0]; j++) { \ 196 for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \ 197 v += srcOpt->dx[0] * MBS; \ 198 } \ 199 } else { /* all other cases */ \ 200 for (i = 0; i < count; i++) { \ 201 s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \ 202 t = (!dstIdx ? 
dstStart + i : dstIdx[i]) * MBS; \ 203 for (j = 0; j < M; j++) \ 204 for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \ 205 } \ 206 } \ 207 PetscFunctionReturn(0); \ 208 } 209 210 #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \ 211 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) { \ 212 Type *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \ 213 const Type *ldata = (const Type *)leafdata; \ 214 PetscInt i, j, k, r, l, bs = link->bs; \ 215 const PetscInt M = (EQ) ? 1 : bs / BS; \ 216 const PetscInt MBS = M * BS; \ 217 PetscFunctionBegin; \ 218 for (i = 0; i < count; i++) { \ 219 r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \ 220 l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \ 221 for (j = 0; j < M; j++) \ 222 for (k = 0; k < BS; k++) { \ 223 lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \ 224 OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \ 225 } \ 226 } \ 227 PetscFunctionReturn(0); \ 228 } 229 230 /* Pack, Unpack/Fetch ops */ 231 #define DEF_Pack(Type, BS, EQ) \ 232 DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) { \ 233 link->h_Pack = CPPJoin4(Pack, Type, BS, EQ); \ 234 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \ 235 link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \ 236 } 237 238 /* Add, Mult ops */ 239 #define DEF_Add(Type, BS, EQ) \ 240 DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, 
OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) { \ 241 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \ 242 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type, BS, EQ); \ 243 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type, BS, EQ); \ 244 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \ 245 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type, BS, EQ); \ 246 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \ 247 } 248 249 /* Max, Min ops */ 250 #define DEF_Cmp(Type, BS, EQ) \ 251 DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) { \ 252 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 253 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 254 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 255 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 256 } 257 258 /* Logical ops. 259 The operator in OP_LXOR should be empty but is ||. It is not used. 
Put here to avoid 260 the compilation warning "empty macro arguments are undefined in ISO C90" 261 */ 262 #define DEF_Log(Type, BS, EQ) \ 263 DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) { \ 264 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \ 265 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \ 266 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \ 267 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \ 268 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \ 269 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \ 270 } 271 272 /* Bitwise ops */ 273 #define DEF_Bit(Type, BS, EQ) \ 274 DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) { \ 275 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \ 276 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \ 277 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \ 278 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \ 279 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \ 280 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \ 281 } 282 283 /* Maxloc, Minloc ops */ 284 #define DEF_Xloc(Type, BS, EQ) \ 285 DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, 
BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) { \ 286 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 287 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 288 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 289 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 290 } 291 292 #define DEF_IntegerType(Type, BS, EQ) \ 293 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) { \ 294 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 295 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 296 CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 297 CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \ 298 CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \ 299 } 300 301 #define DEF_RealType(Type, BS, EQ) \ 302 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) { \ 303 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 304 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 305 CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 306 } 307 308 #if defined(PETSC_HAVE_COMPLEX) 309 #define DEF_ComplexType(Type, BS, EQ) \ 310 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) { \ 311 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 312 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 313 } 314 #endif 315 316 #define DEF_DumbType(Type, BS, EQ) \ 317 DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) { \ 318 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 319 } 320 321 /* Maxloc, Minloc */ 322 #define DEF_PairType(Type, BS, EQ) \ 323 DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, 
BS, EQ)(PetscSFLink link) { \ 324 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 325 CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \ 326 } 327 328 DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT */ 329 DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */ 330 DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */ 331 DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */ 332 DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */ 333 DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */ 334 DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */ 335 DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 336 337 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 338 DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0) 339 #endif 340 341 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 342 typedef signed char SignedChar; 343 DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0) 344 345 typedef unsigned char UnsignedChar; 346 DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0) 347 348 DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) 
DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0) 349 #if defined(PETSC_HAVE_COMPLEX) 350 DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0) 351 #endif 352 353 #define PairType(Type1, Type2) Type1##_##Type2 354 typedef struct { 355 int u; 356 int i; 357 } PairType(int, int); 358 typedef struct { 359 PetscInt u; 360 PetscInt i; 361 } PairType(PetscInt, PetscInt); 362 DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1) 363 364 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 365 DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0) 366 367 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 368 DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0) 369 370 PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link) { 371 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 372 PetscInt i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8; 373 374 PetscFunctionBegin; 375 /* Destroy device-specific fields */ 376 if (link->deviceinited) PetscCall((*link->Destroy)(sf, link)); 377 378 /* Destroy host related fields */ 379 if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); 380 if (!link->use_nvshmem) { 381 for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. 
*/ 382 if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i])); 383 } 384 PetscCall(PetscFree(link->reqs)); 385 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 386 PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST])); 387 PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST])); 388 } 389 } 390 PetscCall(PetscFree(link)); 391 PetscFunctionReturn(0); 392 } 393 394 PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) { 395 PetscFunctionBegin; 396 PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata)); 397 #if defined(PETSC_HAVE_NVSHMEM) 398 { 399 PetscBool use_nvshmem; 400 PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem)); 401 if (use_nvshmem) { 402 PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 403 PetscFunctionReturn(0); 404 } 405 } 406 #endif 407 PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 408 PetscFunctionReturn(0); 409 } 410 411 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 412 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 
413 */ 414 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs) { 415 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 416 PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks; 417 const PetscInt *rootoffset, *leafoffset; 418 MPI_Aint disp; 419 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 420 MPI_Datatype unit = link->unit; 421 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 422 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 423 424 PetscFunctionBegin; 425 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 426 if (sf->persistent) { 427 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 428 PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL)); 429 if (direction == PETSCSF_LEAF2ROOT) { 430 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 431 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 432 cnt = rootoffset[i + 1] - rootoffset[i]; 433 PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 434 } 435 } else { /* PETSCSF_ROOT2LEAF */ 436 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 437 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 438 cnt = rootoffset[i + 1] - rootoffset[i]; 439 PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 440 } 441 } 442 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = 
PETSC_TRUE; 443 } 444 445 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 446 PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL)); 447 if (direction == PETSCSF_LEAF2ROOT) { 448 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 449 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 450 cnt = leafoffset[i + 1] - leafoffset[i]; 451 PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 452 } 453 } else { /* PETSCSF_ROOT2LEAF */ 454 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 455 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 456 cnt = leafoffset[i + 1] - leafoffset[i]; 457 PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 458 } 459 } 460 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 461 } 462 } 463 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 464 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 465 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 466 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 467 PetscFunctionReturn(0); 468 } 469 470 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink) { 471 PetscSFLink link, *p; 472 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 473 474 PetscFunctionBegin; 475 /* Look for types in cache */ 476 for (p = &bas->inuse; (link = *p); p = &link->next) { 477 PetscBool match; 478 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 479 if (match && 
(rootdata == link->rootdata) && (leafdata == link->leafdata)) { 480 switch (cmode) { 481 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 482 case PETSC_USE_POINTER: break; 483 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode"); 484 } 485 *mylink = link; 486 PetscFunctionReturn(0); 487 } 488 } 489 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack"); 490 } 491 492 PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink) { 493 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 494 PetscSFLink link = *mylink; 495 496 PetscFunctionBegin; 497 link->rootdata = NULL; 498 link->leafdata = NULL; 499 link->next = bas->avail; 500 bas->avail = link; 501 *mylink = NULL; 502 PetscFunctionReturn(0); 503 } 504 505 /* Error out on unsupported overlapped communications */ 506 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata) { 507 PetscSFLink link, *p; 508 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 509 PetscBool match; 510 511 PetscFunctionBegin; 512 if (PetscDefined(USE_DEBUG)) { 513 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 514 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 515 */ 516 if (rootdata || leafdata) { 517 for (p = &bas->inuse; (link = *p); p = &link->next) { 518 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 519 PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. 
Undo the overlapping to avoid the error.", rootdata, leafdata); 520 } 521 } 522 } 523 PetscFunctionReturn(0); 524 } 525 526 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) { 527 PetscFunctionBegin; 528 if (n) PetscCall(PetscMemcpy(dst, src, n)); 529 PetscFunctionReturn(0); 530 } 531 532 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit) { 533 PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0; 534 PetscBool is2Int, is2PetscInt; 535 PetscMPIInt ni, na, nd, combiner; 536 #if defined(PETSC_HAVE_COMPLEX) 537 PetscInt nPetscComplex = 0; 538 #endif 539 540 PetscFunctionBegin; 541 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar)); 542 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar)); 543 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 544 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt)); 545 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt)); 546 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal)); 547 #if defined(PETSC_HAVE_COMPLEX) 548 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex)); 549 #endif 550 PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int)); 551 PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt)); 552 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 553 PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner)); 554 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? 
PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 555 link->bs = 1; /* default */ 556 557 if (is2Int) { 558 PackInit_PairType_int_int_1_1(link); 559 link->bs = 1; 560 link->unitbytes = 2 * sizeof(int); 561 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 562 link->basicunit = MPI_2INT; 563 link->unit = MPI_2INT; 564 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 565 PackInit_PairType_PetscInt_PetscInt_1_1(link); 566 link->bs = 1; 567 link->unitbytes = 2 * sizeof(PetscInt); 568 link->basicunit = MPIU_2INT; 569 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 570 link->unit = MPIU_2INT; 571 } else if (nPetscReal) { 572 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); 573 else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link); 574 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); 575 else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link); 576 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); 577 else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link); 578 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); 579 else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link); 580 link->bs = nPetscReal; 581 link->unitbytes = nPetscReal * sizeof(PetscReal); 582 link->basicunit = MPIU_REAL; 583 if (link->bs == 1) { 584 link->isbuiltin = PETSC_TRUE; 585 link->unit = MPIU_REAL; 586 } 587 } else if (nPetscInt) { 588 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); 589 else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 590 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); 591 else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 592 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); 593 else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 594 else if (nPetscInt == 1) 
PackInit_IntegerType_PetscInt_1_1(link); 595 else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 596 link->bs = nPetscInt; 597 link->unitbytes = nPetscInt * sizeof(PetscInt); 598 link->basicunit = MPIU_INT; 599 if (link->bs == 1) { 600 link->isbuiltin = PETSC_TRUE; 601 link->unit = MPIU_INT; 602 } 603 #if defined(PETSC_USE_64BIT_INDICES) 604 } else if (nInt) { 605 if (nInt == 8) PackInit_IntegerType_int_8_1(link); 606 else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link); 607 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); 608 else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link); 609 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); 610 else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link); 611 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); 612 else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link); 613 link->bs = nInt; 614 link->unitbytes = nInt * sizeof(int); 615 link->basicunit = MPI_INT; 616 if (link->bs == 1) { 617 link->isbuiltin = PETSC_TRUE; 618 link->unit = MPI_INT; 619 } 620 #endif 621 } else if (nSignedChar) { 622 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); 623 else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 624 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); 625 else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 626 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); 627 else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 628 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); 629 else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 630 link->bs = nSignedChar; 631 link->unitbytes = nSignedChar * sizeof(SignedChar); 632 link->basicunit = MPI_SIGNED_CHAR; 633 if (link->bs == 1) { 634 link->isbuiltin = PETSC_TRUE; 635 link->unit = MPI_SIGNED_CHAR; 636 } 637 } else if (nUnsignedChar) { 638 if (nUnsignedChar == 8) 
PackInit_IntegerType_UnsignedChar_8_1(link); 639 else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 640 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); 641 else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 642 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); 643 else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 644 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); 645 else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 646 link->bs = nUnsignedChar; 647 link->unitbytes = nUnsignedChar * sizeof(UnsignedChar); 648 link->basicunit = MPI_UNSIGNED_CHAR; 649 if (link->bs == 1) { 650 link->isbuiltin = PETSC_TRUE; 651 link->unit = MPI_UNSIGNED_CHAR; 652 } 653 #if defined(PETSC_HAVE_COMPLEX) 654 } else if (nPetscComplex) { 655 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); 656 else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 657 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); 658 else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 659 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); 660 else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 661 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); 662 else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 663 link->bs = nPetscComplex; 664 link->unitbytes = nPetscComplex * sizeof(PetscComplex); 665 link->basicunit = MPIU_COMPLEX; 666 if (link->bs == 1) { 667 link->isbuiltin = PETSC_TRUE; 668 link->unit = MPIU_COMPLEX; 669 } 670 #endif 671 } else { 672 MPI_Aint lb, nbyte; 673 PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte)); 674 PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb); 675 if (nbyte % 
sizeof(int)) { /* If the type size is not multiple of int, treat the unit as raw bytes */
      if (nbyte == 4) PackInit_DumbType_char_4_1(link);
      else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
      else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
      else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link); /* always true; catch-all for any byte count */
      link->bs         = nbyte;
      link->unitbytes  = nbyte;
      link->basicunit  = MPI_BYTE;
    } else { /* Type size is a multiple of int; pack in int-sized chunks for better vectorization */
      nInt = nbyte / sizeof(int);
      if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
      else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
      else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
      else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
      else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link); /* always true; catch-all */
      link->bs         = nInt;
      link->unitbytes  = nbyte;
      link->basicunit  = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* For non-builtin units, keep our own duplicate of the datatype so the caller may free theirs */
  if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}

/* Look up the UnpackAndOp kernel of the link matching the given memory type, reduction op and atomicity requirement.

   *UnpackAndOp is set to NULL when no kernel matches (e.g. a user-defined MPI_Op); callers are expected to
   fall back to PetscSFLinkUnpackDataWithMPIReduceLocal() in that case. The h_/d_/da_ prefixes select the
   host, device (non-atomic) and device-atomic variants respectively. */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)) {
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) { /* atomics needed when a destination entry has duplicates */
    if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Look up the ScatterAndOp kernel (sparse src -> sparse dst in one pass) for the given memory type, op and atomicity.
   Same dispatch structure and fallback convention as PetscSFLinkGetUnpackAndOp() above. */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) {
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) { /* atomics needed when destination has duplicate indices */
    if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Look up the FetchAndOp kernel. Only summation is supported (errors otherwise); kernel both reads the
   destination into the buffer and adds the buffer into the destination. */
PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) {
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
  if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

/* Look up the local (root-to-leaf, same process) FetchAndOp kernel. Only summation is supported.
   NOTE(review): the error message says "FetchAndOp" though this is the Local variant — looks copied; confirm intent. */
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) {
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
  if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/* Log the flops performed when unpacking into root data; a reduction op on PetscScalars counts one flop per scalar */
static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) {
  PetscLogDouble flops;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
    else
#endif
      PetscCall(PetscLogFlops(flops));
  }
  PetscFunctionReturn(0);
}

/* Log the flops performed when unpacking into leaf data; mirrors PetscSFLinkLogFlopsAfterUnpackRootData() */
static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) {
  PetscLogDouble flops;

  PetscFunctionBegin;
  if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope] * link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
    else
#endif
      PetscCall(PetscLogFlops(flops));
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
  Input Parameters:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

  Output Parameters:
  .data    - The data to unpack to
*/
static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) {
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscInt i;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
    } else { /* contiguous destination: one call reduces all <count> units at once */
      PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
#endif
}

/* Fallback scatter (sparse src -> sparse dst) through MPI_Reduce_local, used when no ScatterAndOp kernel matches op.
   When srcIdx is NULL the source is contiguous from srcStart and the work is delegated to the unpack fallback above. */
static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) {
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscInt i, disp;
    if (!srcIdx) {
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
    } else {
      for (i = 0; i < count; i++) {
        disp = dstIdx ? dstIdx[i] : dstStart + i; /* destination may be indexed or contiguous */
        PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
      }
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
#endif
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
  Input Parameters:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.
   Notes:
   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
*/
PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) {
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt       = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
    PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
    PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
  }
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf; leaf-side mirror of PetscSFLinkPackRootData_Private() */
PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) {
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt       = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
    PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
    PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
  }
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}

/* Pack rootdata to rootbuf, which are in the same memory space; also handles device sync and pre-pack hooks */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) {
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
    if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
  }
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
/* Pack leafdata to leafbuf, which are in the same memory space; leaf-side mirror of PetscSFLinkPackRootData() */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) {
  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if leafdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
    if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
  }
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}

/* Unpack rootbuf into rootdata applying op; falls back to MPI_Reduce_local when no kernel matches op */
PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) {
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt       = NULL;

  PetscFunctionBegin;
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
    if (UnpackAndOp) {
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
      PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
    } else { /* fallback runs on host, hence host indices are requested */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
    }
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
  PetscFunctionReturn(0);
}

/* Unpack leafbuf into leafdata applying op; leaf-side mirror of PetscSFLinkUnpackRootData_Private() */
PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) {
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt       = NULL;

  PetscFunctionBegin;
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
    if (UnpackAndOp) {
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
      PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
    } else { /* fallback runs on host, hence host indices are requested */
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
    }
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
  PetscFunctionReturn(0);
}
/* Unpack rootbuf to rootdata, which are in the same memory space; also handles post-unpack hooks and device sync */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) {
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
  }
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) {
  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
  }
  PetscFunctionReturn(0);
}

/* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) {
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt       = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
    PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}

/* Scatter between rootdata and leafdata on the same process, in the direction given.
   Same-memtype case uses a one-pass ScatterAndOp kernel (or the MPI_Reduce_local fallback);
   cross-memtype case goes pack -> memcpy across spaces -> unpack. */
PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) {
  const PetscInt *rootindices = NULL, *leafindices = NULL;
  PetscInt        count, rootstart, leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
  PetscSFPackOpt leafopt = NULL, rootopt = NULL;
  PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
  char          *srcbuf = NULL, *dstbuf = NULL;
  PetscBool      dstdups;

  PetscFunctionBegin;
  if (!buflen) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
      srcmtype = rootmtype;
      srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
      dstmtype = leafmtype;
      dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
    } else {
      PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
      srcmtype = leafmtype;
      srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
      dstmtype = rootmtype;
      dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
    }
    PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
    /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
    if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
    } else {
      PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
    }
  } else { /* same memory space: one-pass scatter; atomicity requirement depends on destination duplicates */
    dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
    dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
    PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
    if (ScatterAndOp) {
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
      } else {
        PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
      }
    } else { /* MPI_Reduce_local fallback runs on host, hence host indices are requested */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
      } else {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
      }
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) {
  const PetscInt    *rootindices = NULL, *leafindices = NULL;
  PetscInt           count, rootstart, leafstart;
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL, rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
    PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
    PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
    PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
  }
  PetscFunctionReturn(0);
}

/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  + n      - Number of destination ranks
  . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
  - idx    - [*] Array storing indices

   Output Parameters:
  + opt - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out) {
  PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* One flat array holds all fields; the named members are views into slices of it */
  PetscCall(PetscMalloc1(1, &opt));
  PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
  opt->n = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n + 2;
  opt->dx     = opt->array + 2 * n + 2;
  opt->dy     = opt->array + 3 * n + 2;
  opt->dz     = opt->array + 4 * n + 2;
  opt->X      = opt->array + 5 * n + 2;
  opt->Y      = opt->array + 6 * n + 2;

  for (r = 0; r < n; r++) {        /* For each destination rank */
    m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: count consecutive indices starting at <start> */
    for (dx = 1; dx < m; dx++, p++) {
      if (start + dx != idx[p]) break;
    }

    dydz = m / dx;
    /* X is the stride between x-runs, i.e., the row length of the enclosing (sub)array */
    X = dydz > 1 ? (idx[p] - start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m % dx || X <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
      for (i = 0; i < dx; i++, p++) {
        if (start + X * dy + i != idx[p]) {
          if (i) {
            optimizable = PETSC_FALSE;
            goto finish;
          } /* The pattern is violated in the middle of an x-walk */
          else
            goto Z_dimension;
        }
      }
    }

  Z_dimension:
    dz = m / (dx * dy);
    /* Y is the number of rows between z-slabs, i.e., the plane height of the enclosing array */
    Y = dz > 1 ? (idx[p] - start) / X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m % (dx * dy) || Y <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j = 0; j < dy; j++) {
        for (i = 0; i < dx; i++, p++) {
          if (start + X * Y * k + X * j + i != idx[p]) {
            optimizable = PETSC_FALSE;
            goto finish;
          }
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    PetscCall(PetscFree(opt->array));
    PetscCall(PetscFree(opt));
    *out = NULL;
  } else {
    /* Rebase offsets so opt->offset[0] = 0 regardless of the input offset[0] */
    opt->offset[0] = 0;
    for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

/* Free a pack-optimization plan created by PetscSFCreatePackOpt(); *out may be NULL and is NULL on return */
static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out) {
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    PetscCall(PetscSFFree(sf, mtype, opt->array));
    PetscCall(PetscFree(opt));
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

/* Analyze root/leaf index patterns of the SF and precompute the pack/unpack helper fields (contiguity,
   duplicate flags, per-rank optimization plans) used by the Pack/Unpack routines above */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) {
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i, j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i = 0; i < 2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* Self (distinguished-rank) leaves come first in rmine[]; remote leaves follow */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0] + i) {
      sf->leafcontig[0] = PETSC_FALSE;
      break;
    }
  }
  for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1] + j) {
      sf->leafcontig[1] = PETSC_FALSE;
      break;
    }
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
  if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));

  /* Are root indices for self and remote contiguous?
*/ 1327 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1328 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1329 1330 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1331 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1332 1333 for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) { 1334 if (bas->irootloc[i] != bas->rootstart[0] + i) { 1335 bas->rootcontig[0] = PETSC_FALSE; 1336 break; 1337 } 1338 } 1339 for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) { 1340 if (bas->irootloc[i] != bas->rootstart[1] + j) { 1341 bas->rootcontig[1] = PETSC_FALSE; 1342 break; 1343 } 1344 } 1345 1346 if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0])); 1347 if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1])); 1348 1349 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1350 if (PetscDefined(HAVE_DEVICE)) { 1351 PetscBool ismulti = (sf->multi == sf) ? 
PETSC_TRUE : PETSC_FALSE; 1352 if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0])); 1353 if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1])); 1354 if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0])); 1355 if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1])); 1356 } 1357 PetscFunctionReturn(0); 1358 } 1359 1360 PetscErrorCode PetscSFResetPackFields(PetscSF sf) { 1361 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1362 PetscInt i; 1363 1364 PetscFunctionBegin; 1365 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 1366 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i])); 1367 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i])); 1368 #if defined(PETSC_HAVE_DEVICE) 1369 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i])); 1370 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i])); 1371 #endif 1372 } 1373 PetscFunctionReturn(0); 1374 } 1375