1 #include <petsc/private/sfimpl.h> 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 /* 8 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 9 * therefore we pack data types manually. This file defines packing routines for the standard data types. 10 */ 11 12 #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d 13 14 /* Operations working like s += t */ 15 #define OP_BINARY(op, s, t) \ 16 do { \ 17 (s) = (s)op(t); \ 18 } while (0) /* binary ops in the middle such as +, *, && etc. */ 19 #define OP_FUNCTION(op, s, t) \ 20 do { \ 21 (s) = op((s), (t)); \ 22 } while (0) /* ops like a function, such as PetscMax, PetscMin */ 23 #define OP_LXOR(op, s, t) \ 24 do { \ 25 (s) = (!(s)) != (!(t)); \ 26 } while (0) /* logical exclusive OR */ 27 #define OP_ASSIGN(op, s, t) \ 28 do { \ 29 (s) = (t); \ 30 } while (0) 31 /* Ref MPI MAXLOC */ 32 #define OP_XLOC(op, s, t) \ 33 do { \ 34 if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \ 35 else if (!((s).u op(t).u)) s = t; \ 36 } while (0) 37 38 /* DEF_PackFunc - macro defining a Pack routine 39 40 Arguments of the macro: 41 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 42 .BS Block size for vectorization. It is a factor of bsz. 43 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 44 45 Arguments of the Pack routine: 46 +count Number of indices in idx[]. 47 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 48 .opt Per-pack optimization plan. NULL means no such plan. 49 .idx Indices of entries to packed. 50 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 51 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 52 -packed Address of the packed data. 53 */ 54 #define DEF_PackFunc(Type, BS, EQ) \ 55 static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \ 56 { \ 57 const Type *u = (const Type *)unpacked, *u2; \ 58 Type *p = (Type *)packed, *p2; \ 59 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 60 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 61 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 62 PetscFunctionBegin; \ 63 if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \ 64 else if (opt) { /* has optimizations available */ p2 = p; \ 65 for (r = 0; r < opt->n; r++) { \ 66 u2 = u + opt->start[r] * MBS; \ 67 X = opt->X[r]; \ 68 Y = opt->Y[r]; \ 69 for (k = 0; k < opt->dz[r]; k++) \ 70 for (j = 0; j < opt->dy[r]; j++) { \ 71 PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \ 72 p2 += opt->dx[r] * MBS; \ 73 } \ 74 } \ 75 } else { \ 76 for (i = 0; i < count; i++) \ 77 for (j = 0; j < M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 78 for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 79 p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \ 80 } \ 81 PetscFunctionReturn(PETSC_SUCCESS); \ 82 } 83 84 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 85 and inserts into a sparse array. 86 87 Arguments: 88 .Type Type of the data 89 .BS Block size for vectorization 90 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 91 92 Notes: 93 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 94 */ 95 #define DEF_UnpackFunc(Type, BS, EQ) \ 96 static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \ 97 { \ 98 Type *u = (Type *)unpacked, *u2; \ 99 const Type *p = (const Type *)packed; \ 100 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 101 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 102 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 103 PetscFunctionBegin; \ 104 if (!idx) { \ 105 u += start * MBS; \ 106 if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \ 107 } else if (opt) { /* has optimizations available */ \ 108 for (r = 0; r < opt->n; r++) { \ 109 u2 = u + opt->start[r] * MBS; \ 110 X = opt->X[r]; \ 111 Y = opt->Y[r]; \ 112 for (k = 0; k < opt->dz[r]; k++) \ 113 for (j = 0; j < opt->dy[r]; j++) { \ 114 PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \ 115 p += opt->dx[r] * MBS; \ 116 } \ 117 } \ 118 } else { \ 119 for (i = 0; i < count; i++) \ 120 for (j = 0; j < M; j++) \ 121 for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \ 122 } \ 123 PetscFunctionReturn(PETSC_SUCCESS); \ 124 } 125 126 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 127 128 Arguments: 129 +Opname Name of the Op, such as Add, Mult, LAND, etc. 130 .Type Type of the data 131 .BS Block size for vectorization 132 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 133 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 134 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 135 */ 136 #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 137 static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \ 138 { \ 139 Type *u = (Type *)unpacked, *u2; \ 140 const Type *p = (const Type *)packed; \ 141 PetscInt i, j, k, X, Y, r, bs = link->bs; \ 142 const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 143 const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 144 PetscFunctionBegin; \ 145 if (!idx) { \ 146 u += start * MBS; \ 147 for (i = 0; i < count; i++) \ 148 for (j = 0; j < M; j++) \ 149 for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 150 } else if (opt) { /* idx[] has patterns */ \ 151 for (r = 0; r < opt->n; r++) { \ 152 u2 = u + opt->start[r] * MBS; \ 153 X = opt->X[r]; \ 154 Y = opt->Y[r]; \ 155 for (k = 0; k < opt->dz[r]; k++) \ 156 for (j = 0; j < opt->dy[r]; j++) { \ 157 for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \ 158 p += opt->dx[r] * MBS; \ 159 } \ 160 } \ 161 } else { \ 162 for (i = 0; i < count; i++) \ 163 for (j = 0; j < M; j++) \ 164 for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 165 } \ 166 PetscFunctionReturn(PETSC_SUCCESS); \ 167 } 168 169 #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 170 static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \ 171 { \ 172 Type *u = (Type *)unpacked, *p = (Type *)packed, tmp; \ 173 PetscInt i, j, k, r, l, bs = link->bs; \ 174 const PetscInt M = (EQ) ? 1 : bs / BS; \ 175 const PetscInt MBS = M * BS; \ 176 PetscFunctionBegin; \ 177 for (i = 0; i < count; i++) { \ 178 r = (!idx ? start + i : idx[i]) * MBS; \ 179 l = i * MBS; \ 180 for (j = 0; j < M; j++) \ 181 for (k = 0; k < BS; k++) { \ 182 tmp = u[r + j * BS + k]; \ 183 OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \ 184 p[l + j * BS + k] = tmp; \ 185 } \ 186 } \ 187 PetscFunctionReturn(PETSC_SUCCESS); \ 188 } 189 190 #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 191 static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \ 192 { \ 193 const Type *u = (const Type *)src; \ 194 Type *v = (Type *)dst; \ 195 PetscInt i, j, k, s, t, X, Y, bs = link->bs; \ 196 const PetscInt M = (EQ) ? 1 : bs / BS; \ 197 const PetscInt MBS = M * BS; \ 198 PetscFunctionBegin; \ 199 if (!srcIdx) { /* src is contiguous */ \ 200 u += srcStart * MBS; \ 201 PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \ 202 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 203 u += srcOpt->start[0] * MBS; \ 204 v += dstStart * MBS; \ 205 X = srcOpt->X[0]; \ 206 Y = srcOpt->Y[0]; \ 207 for (k = 0; k < srcOpt->dz[0]; k++) \ 208 for (j = 0; j < srcOpt->dy[0]; j++) { \ 209 for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \ 210 v += srcOpt->dx[0] * MBS; \ 211 } \ 212 } else { /* all other cases */ \ 213 for (i = 0; i < count; i++) { \ 214 s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \ 215 t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \ 216 for (j = 0; j < M; j++) \ 217 for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \ 218 } \ 219 } \ 220 PetscFunctionReturn(PETSC_SUCCESS); \ 221 } 222 223 #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \ 224 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \ 225 { \ 226 Type *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \ 227 const Type *ldata = (const Type *)leafdata; \ 228 PetscInt i, j, k, r, l, bs = link->bs; \ 229 const PetscInt M = (EQ) ? 1 : bs / BS; \ 230 const PetscInt MBS = M * BS; \ 231 PetscFunctionBegin; \ 232 for (i = 0; i < count; i++) { \ 233 r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \ 234 l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \ 235 for (j = 0; j < M; j++) \ 236 for (k = 0; k < BS; k++) { \ 237 lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \ 238 OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \ 239 } \ 240 } \ 241 PetscFunctionReturn(PETSC_SUCCESS); \ 242 } 243 244 /* Pack, Unpack/Fetch ops */ 245 #define DEF_Pack(Type, BS, EQ) \ 246 DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \ 247 { \ 248 link->h_Pack = CPPJoin4(Pack, Type, BS, EQ); \ 249 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \ 250 link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \ 251 } 252 253 /* Add, Mult ops */ 254 #define DEF_Add(Type, BS, EQ) \ 255 DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \ 256 { \ 257 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \ 258 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type, BS, EQ); \ 259 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type, BS, EQ); \ 260 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \ 261 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type, BS, EQ); \ 262 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \ 263 } 264 265 /* Max, Min ops */ 266 #define DEF_Cmp(Type, BS, EQ) \ 267 DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \ 268 { \ 269 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 270 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 271 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 272 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 273 } 274 275 /* Logical ops. 276 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 277 the compilation warning "empty macro arguments are undefined in ISO C90" 278 */ 279 #define DEF_Log(Type, BS, EQ) \ 280 DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \ 281 { \ 282 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \ 283 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \ 284 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \ 285 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \ 286 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \ 287 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \ 288 } 289 290 /* Bitwise ops */ 291 #define DEF_Bit(Type, BS, EQ) \ 292 DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \ 293 { \ 294 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \ 295 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \ 296 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \ 297 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \ 298 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \ 299 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \ 300 } 301 302 /* Maxloc, Minloc ops */ 303 #define DEF_Xloc(Type, BS, EQ) \ 304 DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \ 305 { \ 306 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 307 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 308 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 309 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 310 } 311 312 #define DEF_IntegerType(Type, BS, EQ) \ 313 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \ 314 { \ 315 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 316 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 317 CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 318 CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \ 319 CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \ 320 } 321 322 #define DEF_RealType(Type, BS, EQ) \ 323 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \ 324 { \ 325 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 326 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 327 CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 328 } 329 330 #if defined(PETSC_HAVE_COMPLEX) 331 #define DEF_ComplexType(Type, BS, EQ) \ 332 DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \ 333 { \ 334 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 335 CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 336 } 337 #endif 338 339 #define DEF_DumbType(Type, BS, EQ) \ 340 DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \ 341 { \ 342 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 343 } 344 345 /* Maxloc, Minloc */ 346 #define DEF_PairType(Type, BS, EQ) \ 347 DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \ 348 { \ 349 CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 350 CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \ 351 } 352 353 DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT */ 354 DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */ 355 DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */ 356 DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */ 357 DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */ 358 DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */ 359 DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */ 360 DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 361 362 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 363 DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0) 364 #endif 365 366 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 367 typedef signed char SignedChar; 368 DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0) 369 370 typedef unsigned char UnsignedChar; 371 DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0) 372 373 DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0) 374 #if defined(PETSC_HAVE_COMPLEX) 375 DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0) 376 #endif 377 378 #define PairType(Type1, Type2) Type1##_##Type2 379 typedef struct { 380 int u; 381 int i; 382 } PairType(int, int); 383 typedef struct { 384 PetscInt u; 385 PetscInt i; 386 } PairType(PetscInt, PetscInt); 387 DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1) 388 389 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 390 DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0) 391 392 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 393 DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0) 394 395 PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link) 396 { 397 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 398 PetscInt i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8; 399 400 PetscFunctionBegin; 401 /* Destroy device-specific fields */ 402 if (link->deviceinited) PetscCall((*link->Destroy)(sf, link)); 403 404 /* Destroy host related fields */ 405 if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); 406 if (!link->use_nvshmem) { 407 for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */ 408 if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i])); 409 } 410 PetscCall(PetscFree(link->reqs)); 411 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 412 PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST])); 413 PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST])); 414 } 415 } 416 PetscCall(PetscFree(link)); 417 PetscFunctionReturn(PETSC_SUCCESS); 418 } 419 420 PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) 421 { 422 PetscFunctionBegin; 423 PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata)); 424 #if defined(PETSC_HAVE_NVSHMEM) 425 { 426 PetscBool use_nvshmem; 427 PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem)); 428 if (use_nvshmem) { 429 PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 } 433 #endif 434 PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 435 PetscFunctionReturn(PETSC_SUCCESS); 436 } 437 438 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink) 439 { 440 PetscSFLink link, *p; 441 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 442 443 PetscFunctionBegin; 444 /* Look for types in cache */ 445 for (p = &bas->inuse; (link = *p); p = &link->next) { 446 PetscBool match; 447 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 448 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 449 switch (cmode) { 450 case PETSC_OWN_POINTER: 451 *p = link->next; 452 break; /* Remove from inuse list */ 453 case PETSC_USE_POINTER: 454 break; 455 default: 456 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode"); 457 } 458 *mylink = link; 459 PetscFunctionReturn(PETSC_SUCCESS); 460 } 461 } 462 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack"); 463 } 464 465 PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink) 466 { 467 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 468 PetscSFLink link = *mylink; 469 470 PetscFunctionBegin; 471 link->rootdata = NULL; 472 link->leafdata = NULL; 473 link->next = bas->avail; 474 bas->avail = link; 475 *mylink = NULL; 476 PetscFunctionReturn(PETSC_SUCCESS); 477 } 478 479 /* Error out on unsupported overlapped communications */ 480 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata) 481 { 482 PetscSFLink link, *p; 483 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 484 PetscBool match; 485 486 PetscFunctionBegin; 487 if (PetscDefined(USE_DEBUG)) { 488 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 489 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 490 */ 491 if (rootdata || leafdata) { 492 for (p = &bas->inuse; (link = *p); p = &link->next) { 493 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 494 PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata); 495 } 496 } 497 } 498 PetscFunctionReturn(PETSC_SUCCESS); 499 } 500 501 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) 502 { 503 PetscFunctionBegin; 504 if (n) PetscCall(PetscMemcpy(dst, src, n)); 505 PetscFunctionReturn(PETSC_SUCCESS); 506 } 507 508 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit) 509 { 510 PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0; 511 PetscBool is2Int, is2PetscInt; 512 MPIU_Count ni, na, nc, nd; 513 PetscMPIInt combiner; 514 #if defined(PETSC_HAVE_COMPLEX) 515 PetscInt nPetscComplex = 0; 516 #endif 517 518 PetscFunctionBegin; 519 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar)); 520 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar)); 521 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 522 PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt)); 523 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt)); 524 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal)); 525 #if defined(PETSC_HAVE_COMPLEX) 526 PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex)); 527 #endif 528 PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int)); 529 PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt)); 530 /* TODO: should we also handle Fortran MPI_2REAL? */ 531 PetscCallMPI(MPIPetsc_Type_get_envelope(unit, &ni, &na, &nc, &nd, &combiner)); 532 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 533 link->bs = 1; /* default */ 534 535 if (is2Int) { 536 PackInit_PairType_int_int_1_1(link); 537 link->bs = 1; 538 link->unitbytes = 2 * sizeof(int); 539 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 540 link->basicunit = MPI_2INT; 541 link->unit = MPI_2INT; 542 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 543 PackInit_PairType_PetscInt_PetscInt_1_1(link); 544 link->bs = 1; 545 link->unitbytes = 2 * sizeof(PetscInt); 546 link->basicunit = MPIU_2INT; 547 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 548 link->unit = MPIU_2INT; 549 } else if (nPetscReal) { 550 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); 551 else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link); 552 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); 553 else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link); 554 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); 555 else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link); 556 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); 557 else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link); 558 link->bs = nPetscReal; 559 link->unitbytes = nPetscReal * sizeof(PetscReal); 560 link->basicunit = MPIU_REAL; 561 if (link->bs == 1) { 562 link->isbuiltin = PETSC_TRUE; 563 link->unit = MPIU_REAL; 564 } 565 } else if (nPetscInt) { 566 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); 567 else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 568 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); 569 else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 570 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); 571 else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 572 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); 573 else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 574 link->bs = nPetscInt; 575 link->unitbytes = nPetscInt * sizeof(PetscInt); 576 link->basicunit = MPIU_INT; 577 if (link->bs == 1) { 578 link->isbuiltin = PETSC_TRUE; 579 link->unit = MPIU_INT; 580 } 581 #if defined(PETSC_USE_64BIT_INDICES) 582 } else if (nInt) { 583 if (nInt == 8) PackInit_IntegerType_int_8_1(link); 584 else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link); 585 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); 586 else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link); 587 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); 588 else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link); 589 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); 590 else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link); 591 link->bs = nInt; 592 link->unitbytes = nInt * sizeof(int); 593 link->basicunit = MPI_INT; 594 if (link->bs == 1) { 595 link->isbuiltin = PETSC_TRUE; 596 link->unit = MPI_INT; 597 } 598 #endif 599 } else if (nSignedChar) { 600 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); 601 else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 602 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); 603 else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 604 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); 605 else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 606 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); 607 else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 608 link->bs = nSignedChar; 609 link->unitbytes = nSignedChar * sizeof(SignedChar); 610 link->basicunit = MPI_SIGNED_CHAR; 611 if (link->bs == 1) { 612 link->isbuiltin = PETSC_TRUE; 613 link->unit = MPI_SIGNED_CHAR; 614 } 615 } else if (nUnsignedChar) { 616 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); 617 else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 618 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); 619 else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 620 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); 621 else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 622 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); 623 else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 624 link->bs = nUnsignedChar; 625 link->unitbytes = nUnsignedChar * sizeof(UnsignedChar); 626 link->basicunit = MPI_UNSIGNED_CHAR; 627 if (link->bs == 1) { 628 link->isbuiltin = PETSC_TRUE; 629 link->unit = MPI_UNSIGNED_CHAR; 630 } 631 #if defined(PETSC_HAVE_COMPLEX) 632 } else if (nPetscComplex) { 633 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); 634 else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 635 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); 636 else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 637 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); 638 else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 639 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); 640 else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 641 link->bs = nPetscComplex; 642 link->unitbytes = nPetscComplex * sizeof(PetscComplex); 643 link->basicunit = MPIU_COMPLEX; 644 if (link->bs == 1) { 645 link->isbuiltin = PETSC_TRUE; 646 link->unit = MPIU_COMPLEX; 647 } 648 #endif 649 } else { 650 MPI_Aint lb, nbyte; 651 652 PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte)); 653 PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb); 654 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 655 if (nbyte == 4) PackInit_DumbType_char_4_1(link); 656 else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link); 657 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); 658 else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link); 659 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); 660 else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link); 661 PetscCall(PetscIntCast(nbyte, &link->bs)); 662 link->unitbytes = nbyte; 663 link->basicunit = MPI_BYTE; 664 } else { 665 PetscCall(PetscIntCast(nbyte / sizeof(int), &nInt)); 666 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); 667 else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link); 668 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); 669 else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link); 670 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); 671 else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link); 672 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); 673 else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link); 674 link->bs = nInt; 675 link->unitbytes = nbyte; 676 link->basicunit = MPI_INT; 677 } 678 if (link->isbuiltin) link->unit = unit; 679 } 680 681 if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit)); 682 683 link->Memcpy = PetscSFLinkMemcpy_Host; 684 PetscFunctionReturn(PETSC_SUCCESS); 685 } 686 687 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)) 688 { 689 PetscFunctionBegin; 690 *UnpackAndOp = NULL; 691 if (PetscMemTypeHost(mtype)) { 692 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 693 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 694 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 695 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 696 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 697 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 698 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 699 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 700 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 701 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 702 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 703 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 704 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 705 } 706 #if defined(PETSC_HAVE_DEVICE) 707 else if (PetscMemTypeDevice(mtype) && !atomic) { 708 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 709 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 710 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 711 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 712 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 713 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 714 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 715 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 716 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 717 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 718 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 719 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 720 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 721 } else if (PetscMemTypeDevice(mtype) && atomic) { 722 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 723 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 724 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 725 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 726 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 727 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 728 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 729 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 730 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 731 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 732 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 733 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 734 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 735 } 736 #endif 737 PetscFunctionReturn(PETSC_SUCCESS); 738 } 739 740 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) 741 { 742 PetscFunctionBegin; 743 *ScatterAndOp = NULL; 744 if (PetscMemTypeHost(mtype)) { 745 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 746 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 747 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 748 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 749 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 750 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 751 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 752 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 753 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 754 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 755 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 756 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 757 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 758 } 759 #if defined(PETSC_HAVE_DEVICE) 760 else if (PetscMemTypeDevice(mtype) && !atomic) { 761 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 762 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 763 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 764 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 765 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 766 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 767 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 768 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 769 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 770 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 771 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 772 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 773 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 774 } else if (PetscMemTypeDevice(mtype) && atomic) { 775 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 776 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 777 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 778 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 779 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 780 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 781 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 782 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 783 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 784 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 785 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 786 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 787 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 788 } 789 #endif 790 PetscFunctionReturn(PETSC_SUCCESS); 791 } 792 793 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) 794 { 795 PetscFunctionBegin; 796 *FetchAndOp = NULL; 797 PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 798 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 799 #if defined(PETSC_HAVE_DEVICE) 800 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 801 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 802 #endif 803 PetscFunctionReturn(PETSC_SUCCESS); 804 } 805 806 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) 807 { 808 PetscFunctionBegin; 809 *FetchAndOpLocal = NULL; 810 PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 811 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 812 #if defined(PETSC_HAVE_DEVICE) 813 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 814 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 815 #endif 816 PetscFunctionReturn(PETSC_SUCCESS); 817 } 818 819 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) 820 { 821 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 822 823 PetscFunctionBegin; 824 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 825 #if defined(PETSC_HAVE_DEVICE) 826 if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(bas->rootbuflen[scope] * link->bs)); 827 else 828 #endif 829 PetscCall(PetscLogFlops(bas->rootbuflen[scope] * link->bs)); /* # of roots in buffer x # of scalars in unit */ 830 } 831 PetscFunctionReturn(PETSC_SUCCESS); 832 } 833 834 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) 835 { 836 PetscFunctionBegin; 837 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 838 #if defined(PETSC_HAVE_DEVICE) 839 if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(sf->leafbuflen[scope] * link->bs)); /* # of roots in buffer x # of scalars in unit */ 840 else 841 #endif 842 PetscCall(PetscLogFlops(sf->leafbuflen[scope] * link->bs)); 843 } 844 PetscFunctionReturn(PETSC_SUCCESS); 845 } 846 847 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 848 Input Parameters: 849 +sf - The StarForest 850 .link - The link 851 .count - Number of entries to unpack 852 .start - The first index, significant when indices=NULL 853 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 854 .buf - A contiguous buffer to unpack from 855 -op - Operation after unpack 856 857 Output Parameters: 858 .data - The data to unpack to 859 */ 860 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) 861 { 862 PetscFunctionBegin; 863 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 864 { 865 PetscInt i; 866 if (indices) { 867 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 868 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 869 */ 870 for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op)); 871 } else { 872 PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op)); 873 } 874 } 875 PetscFunctionReturn(PETSC_SUCCESS); 876 #else 877 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 878 #endif 879 } 880 881 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) 882 { 883 PetscFunctionBegin; 884 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 885 { 886 PetscInt i, disp; 887 if (!srcIdx) { 888 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op)); 889 } else { 890 for (i = 0; i < count; i++) { 891 disp = dstIdx ? dstIdx[i] : dstStart + i; 892 PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op)); 893 } 894 } 895 } 896 PetscFunctionReturn(PETSC_SUCCESS); 897 #else 898 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 899 #endif 900 } 901 902 /*============================================================================= 903 Pack/Unpack/Fetch/Scatter routines 904 ============================================================================*/ 905 906 /* Pack rootdata to rootbuf 907 Input Parameters: 908 + sf - The SF this packing works on. 909 . link - It gives the memtype of the roots and also provides root buffer. 910 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 911 - rootdata - Where to read the roots. 912 913 Notes: 914 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 915 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 916 */ 917 static PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) 918 { 919 const PetscInt *rootindices = NULL; 920 PetscInt count, start; 921 PetscMemType rootmtype = link->rootmtype; 922 PetscSFPackOpt opt = NULL; 923 PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 924 925 PetscFunctionBegin; 926 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 927 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 928 PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack)); 929 PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 930 } 931 PetscFunctionReturn(PETSC_SUCCESS); 932 } 933 934 /* Pack leafdata to leafbuf */ 935 static PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) 936 { 937 const PetscInt *leafindices = NULL; 938 PetscInt count, start; 939 PetscMemType leafmtype = link->leafmtype; 940 PetscSFPackOpt opt = NULL; 941 PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 942 943 PetscFunctionBegin; 944 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 945 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 946 PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack)); 947 PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 948 } 949 PetscFunctionReturn(PETSC_SUCCESS); 950 } 951 952 /* Pack rootdata to rootbuf, which are in the same memory space */ 953 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) 954 { 955 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 956 957 PetscFunctionBegin; 958 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 959 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 960 if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 961 } 962 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 963 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata)); 964 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 965 PetscFunctionReturn(PETSC_SUCCESS); 966 } 967 /* Pack leafdata to leafbuf, which are in the same memory space */ 968 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) 969 { 970 PetscFunctionBegin; 971 if (scope == PETSCSF_REMOTE) { 972 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 973 if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 974 } 975 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 976 if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata)); 977 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 978 PetscFunctionReturn(PETSC_SUCCESS); 979 } 980 981 static PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) 982 { 983 const PetscInt *rootindices = NULL; 984 PetscInt count, start; 985 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 986 PetscMemType rootmtype = link->rootmtype; 987 PetscSFPackOpt opt = NULL; 988 PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 989 990 PetscFunctionBegin; 991 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 992 PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp)); 993 if (UnpackAndOp) { 994 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 995 PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 996 } else { 997 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices)); 998 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op)); 999 } 1000 } 1001 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op)); 1002 PetscFunctionReturn(PETSC_SUCCESS); 1003 } 1004 1005 static PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) 1006 { 1007 const PetscInt *leafindices = NULL; 1008 PetscInt count, start; 1009 PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1010 PetscMemType leafmtype = link->leafmtype; 1011 PetscSFPackOpt opt = NULL; 1012 1013 PetscFunctionBegin; 1014 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1015 PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp)); 1016 if (UnpackAndOp) { 1017 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 1018 PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 1019 } else { 1020 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices)); 1021 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op)); 1022 } 1023 } 1024 PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op)); 1025 PetscFunctionReturn(PETSC_SUCCESS); 1026 } 1027 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1028 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) 1029 { 1030 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1031 1032 PetscFunctionBegin; 1033 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); // call it even no data is unpacked so that -log_sync can be done collectively 1034 if (bas->rootbuflen[scope] && !link->rootdirect[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op)); 1035 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1036 if (scope == PETSCSF_REMOTE) { 1037 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1038 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1039 } 1040 PetscFunctionReturn(PETSC_SUCCESS); 1041 } 1042 1043 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1044 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) 1045 { 1046 PetscFunctionBegin; 1047 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1048 if (sf->leafbuflen[scope] && !link->leafdirect[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op)); 1049 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1050 if (scope == PETSCSF_REMOTE) { 1051 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1052 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1053 } 1054 PetscFunctionReturn(PETSC_SUCCESS); 1055 } 1056 1057 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1058 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) 1059 { 1060 const PetscInt *rootindices = NULL; 1061 PetscInt count, start; 1062 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1063 PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL; 1064 PetscMemType rootmtype = link->rootmtype; 1065 PetscSFPackOpt opt = NULL; 1066 1067 PetscFunctionBegin; 1068 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1069 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1070 /* Do FetchAndOp on rootdata with rootbuf */ 1071 PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp)); 1072 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices)); 1073 PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype])); 1074 } 1075 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op)); 1076 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1077 PetscFunctionReturn(PETSC_SUCCESS); 1078 } 1079 1080 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) 1081 { 1082 const PetscInt *rootindices = NULL, *leafindices = NULL; 1083 PetscInt count, rootstart, leafstart; 1084 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1085 PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype; 1086 PetscSFPackOpt leafopt = NULL, rootopt = NULL; 1087 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1088 char *srcbuf = NULL, *dstbuf = NULL; 1089 PetscBool dstdups; 1090 PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL; 1091 1092 PetscFunctionBegin; 1093 if (!buflen) PetscFunctionReturn(PETSC_SUCCESS); 1094 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1095 if (direction == PETSCSF_ROOT2LEAF) { 1096 PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata)); 1097 srcmtype = rootmtype; 1098 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1099 dstmtype = leafmtype; 1100 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1101 } else { 1102 PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata)); 1103 srcmtype = leafmtype; 1104 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1105 dstmtype = rootmtype; 1106 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1107 } 1108 PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes)); 1109 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1110 if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link)); 1111 if (direction == PETSCSF_ROOT2LEAF) { 1112 PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op)); 1113 } else { 1114 PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op)); 1115 } 1116 } else { 1117 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1118 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1119 PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp)); 1120 if (ScatterAndOp) { 1121 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 1122 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 1123 if (direction == PETSCSF_ROOT2LEAF) { 1124 PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata)); 1125 } else { 1126 PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata)); 1127 } 1128 } else { 1129 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 1130 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 1131 if (direction == PETSCSF_ROOT2LEAF) { 1132 PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op)); 1133 } else { 1134 PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op)); 1135 } 1136 } 1137 } 1138 PetscFunctionReturn(PETSC_SUCCESS); 1139 } 1140 1141 /* Fetch rootdata to leafdata and leafupdate locally */ 1142 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) 1143 { 1144 const PetscInt *rootindices = NULL, *leafindices = NULL; 1145 PetscInt count, rootstart, leafstart; 1146 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1147 const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype; 1148 PetscSFPackOpt leafopt = NULL, rootopt = NULL; 1149 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 1150 1151 PetscFunctionBegin; 1152 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS); 1153 if (rootmtype != leafmtype) { 1154 /* The local communication has to go through pack and unpack */ 1155 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1156 } else { 1157 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 1158 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 1159 PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal)); 1160 PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate)); 1161 } 1162 PetscFunctionReturn(PETSC_SUCCESS); 1163 } 1164 1165 /* 1166 Create per-rank pack/unpack optimizations based on indices patterns 1167 1168 Input Parameters: 1169 + n - Number of destination ranks 1170 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1171 - idx - [*] Array storing indices 1172 1173 Output Parameters: 1174 + opt - Pack optimizations. NULL if no optimizations. 1175 */ 1176 static PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out) 1177 { 1178 PetscInt r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y; 1179 PetscBool optimizable = PETSC_TRUE; 1180 PetscSFPackOpt opt; 1181 1182 PetscFunctionBegin; 1183 PetscCall(PetscMalloc1(1, &opt)); 1184 PetscCall(PetscMalloc1(7 * n + 2, &opt->array)); 1185 opt->n = opt->array[0] = n; 1186 opt->offset = opt->array + 1; 1187 opt->start = opt->array + n + 2; 1188 opt->dx = opt->array + 2 * n + 2; 1189 opt->dy = opt->array + 3 * n + 2; 1190 opt->dz = opt->array + 4 * n + 2; 1191 opt->X = opt->array + 5 * n + 2; 1192 opt->Y = opt->array + 6 * n + 2; 1193 1194 for (r = 0; r < n; r++) { /* For each destination rank */ 1195 m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1196 p = offset[r]; 1197 start = idx[p]; /* First index for this rank */ 1198 p++; 1199 1200 /* Search in X dimension */ 1201 for (dx = 1; dx < m; dx++, p++) { 1202 if (start + dx != idx[p]) break; 1203 } 1204 1205 dydz = m / dx; 1206 X = dydz > 1 ? (idx[p] - start) : dx; 1207 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1208 if (m % dx || X <= 0) { 1209 optimizable = PETSC_FALSE; 1210 goto finish; 1211 } 1212 for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */ 1213 for (i = 0; i < dx; i++, p++) { 1214 if (start + X * dy + i != idx[p]) { 1215 if (i) { 1216 optimizable = PETSC_FALSE; 1217 goto finish; 1218 } /* The pattern is violated in the middle of an x-walk */ 1219 else 1220 goto Z_dimension; 1221 } 1222 } 1223 } 1224 1225 Z_dimension: 1226 dz = m / (dx * dy); 1227 Y = dz > 1 ? (idx[p] - start) / X : dy; 1228 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1229 if (m % (dx * dy) || Y <= 0) { 1230 optimizable = PETSC_FALSE; 1231 goto finish; 1232 } 1233 for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1234 for (j = 0; j < dy; j++) { 1235 for (i = 0; i < dx; i++, p++) { 1236 if (start + X * Y * k + X * j + i != idx[p]) { 1237 optimizable = PETSC_FALSE; 1238 goto finish; 1239 } 1240 } 1241 } 1242 } 1243 opt->start[r] = start; 1244 opt->dx[r] = dx; 1245 opt->dy[r] = dy; 1246 opt->dz[r] = dz; 1247 opt->X[r] = X; 1248 opt->Y[r] = Y; 1249 } 1250 1251 finish: 1252 /* If not optimizable, free arrays to save memory */ 1253 if (!n || !optimizable) { 1254 PetscCall(PetscFree(opt->array)); 1255 PetscCall(PetscFree(opt)); 1256 *out = NULL; 1257 } else { 1258 opt->offset[0] = 0; 1259 for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r]; 1260 *out = opt; 1261 } 1262 PetscFunctionReturn(PETSC_SUCCESS); 1263 } 1264 1265 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out) 1266 { 1267 PetscSFPackOpt opt = *out; 1268 1269 PetscFunctionBegin; 1270 if (opt) { 1271 PetscCall(PetscSFFree(sf, mtype, opt->array)); 1272 PetscCall(PetscFree(opt)); 1273 *out = NULL; 1274 } 1275 PetscFunctionReturn(PETSC_SUCCESS); 1276 } 1277 1278 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1279 { 1280 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1281 PetscInt i, j; 1282 1283 PetscFunctionBegin; 1284 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1285 for (i = 0; i < 2; i++) { /* Set defaults */ 1286 sf->leafstart[i] = 0; 1287 sf->leafcontig[i] = PETSC_TRUE; 1288 sf->leafdups[i] = PETSC_FALSE; 1289 bas->rootstart[i] = 0; 1290 bas->rootcontig[i] = PETSC_TRUE; 1291 bas->rootdups[i] = PETSC_FALSE; 1292 } 1293 1294 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1295 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1296 1297 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1298 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1299 1300 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1301 for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */ 1302 if (sf->rmine[i] != sf->leafstart[0] + i) { 1303 sf->leafcontig[0] = PETSC_FALSE; 1304 break; 1305 } 1306 } 1307 for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */ 1308 if (sf->rmine[i] != sf->leafstart[1] + j) { 1309 sf->leafcontig[1] = PETSC_FALSE; 1310 break; 1311 } 1312 } 1313 1314 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1315 if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0])); 1316 if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1])); 1317 1318 /* Are root indices for self and remote contiguous? */ 1319 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1320 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1321 1322 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1323 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1324 1325 for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) { 1326 if (bas->irootloc[i] != bas->rootstart[0] + i) { 1327 bas->rootcontig[0] = PETSC_FALSE; 1328 break; 1329 } 1330 } 1331 for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) { 1332 if (bas->irootloc[i] != bas->rootstart[1] + j) { 1333 bas->rootcontig[1] = PETSC_FALSE; 1334 break; 1335 } 1336 } 1337 1338 if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0])); 1339 if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1])); 1340 1341 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1342 if (PetscDefined(HAVE_DEVICE)) { 1343 PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1344 if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0])); 1345 if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1])); 1346 if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0])); 1347 if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1])); 1348 } 1349 PetscFunctionReturn(PETSC_SUCCESS); 1350 } 1351 1352 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1353 { 1354 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1355 PetscInt i; 1356 1357 PetscFunctionBegin; 1358 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 1359 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i])); 1360 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i])); 1361 #if defined(PETSC_HAVE_DEVICE) 1362 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i])); 1363 PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i])); 1364 #endif 1365 } 1366 PetscFunctionReturn(PETSC_SUCCESS); 1367 } 1368