#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Paste four tokens together with underscores, e.g., CPPJoin4(Pack, int, 1, 1) -> Pack_int_1_1 */
#define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op, s, t) \
  do { \
    (s) = (s)op(t); \
  } while (0) /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op, s, t) \
  do { \
    (s) = op((s), (t)); \
  } while (0) /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op, s, t) \
  do { \
    (s) = (!(s)) != (!(t)); \
  } while (0) /* logical exclusive OR; the op argument is ignored */
#define OP_ASSIGN(op, s, t) \
  do { \
    (s) = (t); \
  } while (0)
/* Ref MPI MAXLOC: on equal values keep the smaller index; otherwise keep the (value,index) pair that wins under op */
#define OP_XLOC(op, s, t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
    else if (!((s).u op(t).u)) s = t; \
  } while (0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bsz.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry.
              Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
  { \
    const Type    *u = (const Type *)unpacked, *u2; \
    Type          *p = (Type *)packed, *p2; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
            p2 += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }

/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
     and inserts into a sparse array.

   Arguments:
   +Type      Type of the data
   .BS        Block size for vectorization
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

  Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
  { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start * MBS; \
      if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \
    } else if (opt) { /* has optimizations available */ \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }

/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname    Name of the Op, such as Add, Mult, LAND, etc.
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   -OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
  { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start * MBS; \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }

/* DEF_FetchAndOp - like UnpackAndOp, but the old root value is returned in the buffer (fetch-and-op semantics) */
#define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
  { \
    Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    for (i = 0; i < count; i++) { \
      r = (!idx ? start + i : idx[i]) * MBS; \
      l = i * MBS; \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          tmp = u[r + j * BS + k]; \
          OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
          p[l + j * BS + k] = tmp; \
        } \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }

/* DEF_ScatterAndOp - sparse-to-sparse scatter src into dst with op; special-cases contiguous src and 3D src / contiguous dst */
#define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
  { \
    const Type    *u = (const Type *)src; \
    Type          *v = (Type *)dst; \
    PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart * MBS; \
      PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0] * MBS; \
      v += dstStart * MBS; \
      X = srcOpt->X[0]; \
      Y = srcOpt->Y[0]; \
      for (k = 0; k < srcOpt->dz[0]; k++) \
        for (j = 0; j < srcOpt->dy[0]; j++) { \
          for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
          v += srcOpt->dx[0] * MBS; \
        } \
    } else { /* all other cases */ \
      for (i = 0; i < count; i++) { \
        s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
        t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
      } \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }

/* DEF_FetchAndOpLocal - local (on-process) fetch-and-op: leafupdate receives the old root value, then root is updated with leafdata */
#define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
  { \
    Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
    const Type    *ldata = (const Type *)leafdata; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    for (i = 0; i < count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
      l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
          OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
        } \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type, BS, EQ) \
  DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }

/* Logical ops.
    The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
    the compilation warning "empty macro arguments are undefined in ISO C90"
 */
#define DEF_Log(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }

/* Integer types support insert, add/mult, max/min, logical and bitwise ops */
#define DEF_IntegerType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
  }

/* Real types support insert, add/mult and max/min, but no logical/bitwise ops */
#define DEF_RealType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
/* Complex types support insert and add/mult only (no ordering) */
#define DEF_ComplexType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
  }
#endif

/* Dumb types only support insert (pure data movement, no reductions) */
#define DEF_DumbType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
  }

DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)

DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1)
DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
#endif

#define PairType(Type1, Type2) Type1##_##Type2
/* (value,index) pairs used by maxloc/minloc; layout matches MPI_2INT / MPIU_2INT (u = value, i = index, see OP_XLOC) */
typedef struct {
  int u;
  int i;
} PairType(int, int);
typedef struct {
  PetscInt u;
  PetscInt i;
} PairType(PetscInt, PetscInt);
DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)

/* Free a communication link: device fields, the dup'ed MPI datatype, persistent requests, and host buffers */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8; /* 8 = combinations of direction x memtype x direct, presumably -- matches the [2][2][2] request tables used below; TODO confirm */

  PetscFunctionBegin;
  /* Destroy device-specific fields */
  if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));

  /* Destroy host related fields */
  if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); /* only dup'ed (non-builtin) types are freed */
  if (!link->use_nvshmem) {
    for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
    }
    PetscCall(PetscFree(link->reqs));
    for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
      PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
      PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
    }
  }
  PetscCall(PetscFree(link));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Create a link for the given unit/memtype/op/operation, dispatching to NVSHMEM when available and applicable, else MPI */
PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscFunctionBegin;
  PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
#if defined(PETSC_HAVE_NVSHMEM)
  {
    PetscBool use_nvshmem;
    PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
    if (use_nvshmem) {
      PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm           = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit           = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
      if (direction == PETSCSF_LEAF2ROOT) {
        /* leaf->root: roots receive; loop over non-distinguished (off-process) root ranks only */
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
          cnt  = rootoffset[i + 1] - rootoffset[i];
          PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
          cnt  = rootoffset[i + 1] - rootoffset[i];
          PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
      if (direction == PETSCSF_LEAF2ROOT) {
        /* leaf->root: leaves send */
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Find the in-use link matching (unit, rootdata, leafdata); with PETSC_OWN_POINTER the link is removed from the in-use list.
   Errors if no matching link is found. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
{
  PetscSFLink    link, *p;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p = &bas->inuse; (link = *p); p = &link->next) {
    PetscBool match;
    PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER:
        *p = link->next;
        break; /* Remove from inuse list */
      case PETSC_USE_POINTER:
        break;
      default:
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
}

/* Return a link to the available list for reuse; clears its rootdata/leafdata association */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
{
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscSFLink    link = *mylink;

  PetscFunctionBegin;
  link->rootdata = NULL;
  link->leafdata = NULL;
  link->next     = bas->avail;
  bas->avail     = link;
  *mylink        = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSFLink    link, *p;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
       the potential overlapping since this process does not participate in communication. Overlapping is harmless.
    */
    if (rootdata || leafdata) {
      for (p = &bas->inuse; (link = *p); p = &link->next) {
        PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
        PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Host-side Memcpy callback; memtype arguments are ignored on the host */
static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
{
  PetscFunctionBegin;
  if (n) PetscCall(PetscMemcpy(dst, src, n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Decompose the MPI datatype 'unit' into a known basic type and block size bs, then install the
   matching host pack/unpack function pointers on the link (generated above by the DEF_* macros) */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
{
  PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
  PetscBool   is2Int, is2PetscInt;
  PetscMPIInt ni, na, nd, combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt nPetscComplex = 0;
#endif

  PetscFunctionBegin;
  PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
  PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
  PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
  PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
#if defined(PETSC_HAVE_COMPLEX)
  PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
#endif
  PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
  PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ?
PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1;                                      /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2 * sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2 * sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* Pick the routine with the largest block size BS dividing the count; ==BS cases use the compile-time-const EQ=1 variants */
    if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
    else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
    else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
    else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
    else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal * sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPIU_REAL;
    }
  } else if (nPetscInt) {
    if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
    else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
    else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
    else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
    else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt * sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPIU_INT;
    }
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* 'int' differs from PetscInt only with 64-bit indices; see matching guard on the DEF_IntegerType(int,...) instantiations */
    if (nInt == 8) PackInit_IntegerType_int_8_1(link);
    else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
    else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
    else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
    else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt * sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPI_INT;
    }
#endif
  } else if (nSignedChar) {
    if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
    else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
    else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
    else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
    else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar * sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPI_SIGNED_CHAR;
    }
  } else if (nUnsignedChar) {
    if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
    else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
    else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
    else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
    else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPI_UNSIGNED_CHAR;
    }
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
    else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
    else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
    else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
    else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex * sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPIU_COMPLEX;
    }
#endif
  } else { /* Unknown basic type: treat the unit as raw bytes or a stream of ints (dumb types support Insert only) */
    MPI_Aint lb, nbyte;
    PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
    PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if (nbyte == 4) PackInit_DumbType_char_4_1(link);
      else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
      else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
      else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
      else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
      else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
      else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
      else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Non-builtin units are dup'ed so the link owns its copy (freed in PetscSFLinkDestroy) */
  if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Map (memtype, MPI_Op) to the link's UnpackAndOp callback; *UnpackAndOp is NULL when unsupported.
   NOTE: continues beyond this chunk. */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op
== MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 755 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 756 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 757 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 758 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 759 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 760 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 761 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 762 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 763 } 764 #if defined(PETSC_HAVE_DEVICE) 765 else if (PetscMemTypeDevice(mtype) && !atomic) { 766 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 767 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 768 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 769 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 770 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 771 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 772 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 773 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 774 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 775 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 776 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 777 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 778 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 779 } else if (PetscMemTypeDevice(mtype) && atomic) { 780 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 781 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 782 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 783 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 784 else if (op 
== MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 785 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 786 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 787 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 788 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 789 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 790 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 791 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 792 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 793 } 794 #endif 795 PetscFunctionReturn(PETSC_SUCCESS); 796 } 797 798 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) 799 { 800 PetscFunctionBegin; 801 *ScatterAndOp = NULL; 802 if (PetscMemTypeHost(mtype)) { 803 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 804 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 805 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 806 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 807 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 808 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 809 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 810 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 811 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 812 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 813 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 814 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 815 else if (op == MPI_MINLOC) *ScatterAndOp = 
link->h_ScatterAndMinloc; 816 } 817 #if defined(PETSC_HAVE_DEVICE) 818 else if (PetscMemTypeDevice(mtype) && !atomic) { 819 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 820 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 821 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 822 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 823 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 824 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 825 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 826 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 827 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 828 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 829 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 830 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 831 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 832 } else if (PetscMemTypeDevice(mtype) && atomic) { 833 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 834 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 835 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 836 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 837 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 838 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 839 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 840 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 841 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 842 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 843 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 844 else if (op == MPI_MAXLOC) *ScatterAndOp = 
link->da_ScatterAndMaxloc; 845 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 846 } 847 #endif 848 PetscFunctionReturn(PETSC_SUCCESS); 849 } 850 851 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) 852 { 853 PetscFunctionBegin; 854 *FetchAndOp = NULL; 855 PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 856 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 857 #if defined(PETSC_HAVE_DEVICE) 858 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 859 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 860 #endif 861 PetscFunctionReturn(PETSC_SUCCESS); 862 } 863 864 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) 865 { 866 PetscFunctionBegin; 867 *FetchAndOpLocal = NULL; 868 PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 869 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 870 #if defined(PETSC_HAVE_DEVICE) 871 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 872 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 873 #endif 874 PetscFunctionReturn(PETSC_SUCCESS); 875 } 876 877 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) 878 { 879 PetscLogDouble flops; 880 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 881 882 PetscFunctionBegin; 883 if (op != 
MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 884 flops = bas->rootbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */ 885 #if defined(PETSC_HAVE_DEVICE) 886 if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); 887 else 888 #endif 889 PetscCall(PetscLogFlops(flops)); 890 } 891 PetscFunctionReturn(PETSC_SUCCESS); 892 } 893 894 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) 895 { 896 PetscLogDouble flops; 897 898 PetscFunctionBegin; 899 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 900 flops = sf->leafbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */ 901 #if defined(PETSC_HAVE_DEVICE) 902 if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); 903 else 904 #endif 905 PetscCall(PetscLogFlops(flops)); 906 } 907 PetscFunctionReturn(PETSC_SUCCESS); 908 } 909 910 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 911 Input Parameters: 912 +sf - The StarForest 913 .link - The link 914 .count - Number of entries to unpack 915 .start - The first index, significant when indices=NULL 916 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 917 .buf - A contiguous buffer to unpack from 918 -op - Operation after unpack 919 920 Output Parameters: 921 .data - The data to unpack to 922 */ 923 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) 924 { 925 PetscFunctionBegin; 926 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 927 { 928 PetscInt i; 929 if (indices) { 930 /* Note we use link->unit instead of link->basicunit. 
When op can be mapped to MPI_SUM etc, it operates on 931 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 932 */ 933 for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op)); 934 } else { 935 PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op)); 936 } 937 } 938 PetscFunctionReturn(PETSC_SUCCESS); 939 #else 940 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 941 #endif 942 } 943 944 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) 945 { 946 PetscFunctionBegin; 947 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 948 { 949 PetscInt i, disp; 950 if (!srcIdx) { 951 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op)); 952 } else { 953 for (i = 0; i < count; i++) { 954 disp = dstIdx ? dstIdx[i] : dstStart + i; 955 PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op)); 956 } 957 } 958 } 959 PetscFunctionReturn(PETSC_SUCCESS); 960 #else 961 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 962 #endif 963 } 964 965 /*============================================================================= 966 Pack/Unpack/Fetch/Scatter routines 967 ============================================================================*/ 968 969 /* Pack rootdata to rootbuf 970 Input Parameters: 971 + sf - The SF this packing works on. 972 . link - It gives the memtype of the roots and also provides root buffer. 973 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. 
Note SF has the ability to do local and remote communications separately. 974 - rootdata - Where to read the roots. 975 976 Notes: 977 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 978 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 979 */ 980 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) 981 { 982 const PetscInt *rootindices = NULL; 983 PetscInt count, start; 984 PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 985 PetscMemType rootmtype = link->rootmtype; 986 PetscSFPackOpt opt = NULL; 987 988 PetscFunctionBegin; 989 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 990 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 991 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 992 PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack)); 993 PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 994 } 995 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 996 PetscFunctionReturn(PETSC_SUCCESS); 997 } 998 999 /* Pack leafdata to leafbuf */ 1000 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) 1001 { 1002 const PetscInt *leafindices = NULL; 1003 PetscInt count, start; 1004 PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 1005 PetscMemType leafmtype = link->leafmtype; 1006 PetscSFPackOpt opt = NULL; 1007 1008 PetscFunctionBegin; 1009 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 1010 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1011 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, 
link, leafmtype, scope, &count, &start, &opt, &leafindices)); 1012 PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack)); 1013 PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 1014 } 1015 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 1016 PetscFunctionReturn(PETSC_SUCCESS); 1017 } 1018 1019 /* Pack rootdata to rootbuf, which are in the same memory space */ 1020 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) 1021 { 1022 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1023 1024 PetscFunctionBegin; 1025 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1026 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1027 if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1028 } 1029 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 1030 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata)); 1031 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 1032 PetscFunctionReturn(PETSC_SUCCESS); 1033 } 1034 /* Pack leafdata to leafbuf, which are in the same memory space */ 1035 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) 1036 { 1037 PetscFunctionBegin; 1038 if (scope == PETSCSF_REMOTE) { 1039 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1040 if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1041 } 1042 PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 1043 if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata)); 1044 PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 1045 
PetscFunctionReturn(PETSC_SUCCESS); 1046 } 1047 1048 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) 1049 { 1050 const PetscInt *rootindices = NULL; 1051 PetscInt count, start; 1052 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1053 PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1054 PetscMemType rootmtype = link->rootmtype; 1055 PetscSFPackOpt opt = NULL; 1056 1057 PetscFunctionBegin; 1058 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1059 PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp)); 1060 if (UnpackAndOp) { 1061 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 1062 PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 1063 } else { 1064 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices)); 1065 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op)); 1066 } 1067 } 1068 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op)); 1069 PetscFunctionReturn(PETSC_SUCCESS); 1070 } 1071 1072 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) 1073 { 1074 const PetscInt *leafindices = NULL; 1075 PetscInt count, start; 1076 PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1077 PetscMemType leafmtype = link->leafmtype; 1078 PetscSFPackOpt opt = NULL; 1079 1080 PetscFunctionBegin; 1081 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1082 
PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp)); 1083 if (UnpackAndOp) { 1084 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 1085 PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 1086 } else { 1087 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices)); 1088 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op)); 1089 } 1090 } 1091 PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op)); 1092 PetscFunctionReturn(PETSC_SUCCESS); 1093 } 1094 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1095 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) 1096 { 1097 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1098 1099 PetscFunctionBegin; 1100 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1101 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op)); 1102 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1103 if (scope == PETSCSF_REMOTE) { 1104 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1105 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1106 } 1107 PetscFunctionReturn(PETSC_SUCCESS); 1108 } 1109 1110 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1111 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) 1112 { 1113 PetscFunctionBegin; 1114 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1115 if (sf->leafbuflen[scope]) 
PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op)); 1116 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1117 if (scope == PETSCSF_REMOTE) { 1118 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1119 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1120 } 1121 PetscFunctionReturn(PETSC_SUCCESS); 1122 } 1123 1124 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1125 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) 1126 { 1127 const PetscInt *rootindices = NULL; 1128 PetscInt count, start; 1129 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1130 PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL; 1131 PetscMemType rootmtype = link->rootmtype; 1132 PetscSFPackOpt opt = NULL; 1133 1134 PetscFunctionBegin; 1135 PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 1136 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1137 /* Do FetchAndOp on rootdata with rootbuf */ 1138 PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp)); 1139 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices)); 1140 PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype])); 1141 } 1142 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op)); 1143 PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1144 PetscFunctionReturn(PETSC_SUCCESS); 1145 } 1146 1147 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) 1148 { 1149 const PetscInt *rootindices = NULL, *leafindices = NULL; 
1150 PetscInt count, rootstart, leafstart; 1151 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1152 PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL; 1153 PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype; 1154 PetscSFPackOpt leafopt = NULL, rootopt = NULL; 1155 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1156 char *srcbuf = NULL, *dstbuf = NULL; 1157 PetscBool dstdups; 1158 1159 PetscFunctionBegin; 1160 if (!buflen) PetscFunctionReturn(PETSC_SUCCESS); 1161 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1162 if (direction == PETSCSF_ROOT2LEAF) { 1163 PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata)); 1164 srcmtype = rootmtype; 1165 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1166 dstmtype = leafmtype; 1167 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1168 } else { 1169 PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata)); 1170 srcmtype = leafmtype; 1171 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1172 dstmtype = rootmtype; 1173 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1174 } 1175 PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes)); 1176 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1177 if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link)); 1178 if (direction == PETSCSF_ROOT2LEAF) { 1179 PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op)); 1180 } else { 1181 PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op)); 1182 } 1183 } else { 1184 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1185 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? 
link->leafmtype : link->rootmtype;
    PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
    if (ScatterAndOp) {
      /* A fused scatter kernel exists for this op/memtype: scatter directly between the two (sparse) arrays */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
      } else {
        PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
      }
    } else {
      /* No fused kernel (e.g., user-defined MPI_Op): fall back to the MPI_Reduce_local-based path on the host */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
      } else {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Fetch rootdata to leafdata and leafupdate locally

   For local (on-process) edges only: leafupdate receives the current rootdata values
   (the "fetch"), then rootdata is updated with op applied to the incoming leafdata.
   Requires rootdata and leafdata to live in the same memory type (both host or both
   device); a mixed CPU/GPU combination is rejected because this operation cannot be
   split into separate pack and unpack phases.
*/
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  const PetscInt *rootindices = NULL, *leafindices = NULL;
  PetscInt        count, rootstart, leafstart;
  PetscSF_Basic  *bas                                                                                                                                                             = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL, rootopt = NULL;

  PetscFunctionBegin;
  /* Nothing to do if this process has no local (self-to-self) root entries */
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
    PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
    /* rootdups[] lets the kernel skip atomics when root indices are known to be duplicate-free */
    PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
    PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  +  n       - Number of destination ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
  -  idx     - [*] Array storing indices

   Output Parameters:
  +  opt     - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
{
  PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* All per-rank fields live in one contiguous array (7 fields of length n, plus offset[n+1] and the leading n),
     so the whole plan can be copied to a device in a single transfer and freed with a single PetscFree. */
  PetscCall(PetscMalloc1(1, &opt));
  PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
  opt->n = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start = opt->array + n + 2;
  opt->dx = opt->array + 2 * n + 2;
  opt->dy = opt->array + 3 * n + 2;
  opt->dz = opt->array + 4 * n + 2;
  opt->X = opt->array + 5 * n + 2;
  opt->Y = opt->array + 6 * n + 2;

  /* Try to recognize each rank's index list as a dense dx*dy*dz sub-block of a (virtual) X*Y*? 3D lattice,
     i.e., idx[.] == start + X*Y*k + X*j + i for i<dx, j<dy, k<dz. If every rank matches, memcpy-based
     packing (see DEF_PackFunc) can replace element-wise indexed copies. */
  for (r = 0; r < n; r++) {          /* For each destination rank */
    m = offset[r + 1] - offset[r];   /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: dx = length of the initial run of consecutive indices */
    for (dx = 1; dx < m; dx++, p++) {
      if (start + dx != idx[p]) break;
    }

    dydz = m / dx;
    /* X is the row stride: the jump from start to the first index after the x-run.
       When dydz <= 1 the whole list was one run and idx[p] would be out of range, so use dx. */
    X = dydz > 1 ? (idx[p] - start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m % dx || X <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
      for (i = 0; i < dx; i++, p++) {
        if (start + X * dy + i != idx[p]) {
          if (i) {
            optimizable = PETSC_FALSE;
            goto finish;
          } /* The pattern is violated in the middle of an x-walk */
          else /* Break at a row boundary: rows 0..dy-1 form a plane; the jump may start the next z-plane */
            goto Z_dimension;
        }
      }
    }

  Z_dimension:
    dz = m / (dx * dy);
    /* Y is the plane height (rows per z-plane), derived from the jump to the first index of plane 1 */
    Y = dz > 1 ? (idx[p] - start) / X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m % (dx * dy) || Y <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j = 0; j < dy; j++) {
        for (i = 0; i < dx; i++, p++) {
          if (start + X * Y * k + X * j + i != idx[p]) {
            optimizable = PETSC_FALSE;
            goto finish;
          }
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r] = dx;
    opt->dy[r] = dy;
    opt->dz[r] = dz;
    opt->X[r] = X;
    opt->Y[r] = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    PetscCall(PetscFree(opt->array));
    PetscCall(PetscFree(opt));
    *out = NULL;
  } else {
    /* offset[r] = number of entries packed before rank r; used to locate each rank's slice in the pack buffer */
    opt->offset[0] = 0;
    for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free a pack-optimization plan (host or device memory, per mtype) and set *out to NULL; no-op when *out is NULL */
static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
{
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    PetscCall(PetscSFFree(sf, mtype, opt->array));
    PetscCall(PetscFree(opt));
    *out = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Analyze leaf/root index layouts and set the packing-related fields (start, contig, dups, packopt)
   used to choose between contiguous memcpy, per-rank optimized, and fully indexed pack/unpack paths. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i, j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i = 0; i < 2; i++) { /* Set defaults */
    sf->leafstart[i] = 0;
    sf->leafcontig[i] = PETSC_TRUE;
    sf->leafdups[i] = PETSC_FALSE;
    bas->rootstart[i] = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i] = PETSC_FALSE;
  }

  /* Leaf buffer sizes: roffset[ndranks] entries are for on-node (distinguished) ranks, the rest are remote */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* leafstart is only meaningful when the corresponding buffer is non-empty */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0] + i) {
      sf->leafcontig[0] = PETSC_FALSE;
      break;
    }
  }
  for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1] + j) {
      sf->leafcontig[1] = PETSC_FALSE;
      break;
    }
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
  if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));

  /* Are root indices for self and remote contiguous? (same analysis as for leaves, using bas->ioffset/irootloc) */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) { /* self */
    if (bas->irootloc[i] != bas->rootstart[0] + i) {
      bas->rootcontig[0] = PETSC_FALSE;
      break;
    }
  }
  for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) { /* remote */
    if (bas->irootloc[i] != bas->rootstart[1] + j) {
      bas->rootcontig[1] = PETSC_FALSE;
      break;
    }
  }

  if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
  if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));

  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (PetscDefined(HAVE_DEVICE)) {
    /* NOTE(review): the multi-SF (sf->multi == sf) is skipped — presumably its indices are known to contain dups by construction; confirm with PetscSFGetMultiSF */
    PetscBool ismulti = (sf->multi == sf) ?
PETSC_TRUE : PETSC_FALSE;
    /* Only non-contiguous index lists need the dup analysis; contiguous ones trivially have no duplicates */
    if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
    if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
    if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
    if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free the pack-optimization plans created by PetscSFSetUpPackFields, for both the LOCAL and
   REMOTE scopes, on host and (when PETSc is built with device support) on device. */
PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
    PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
#if defined(PETSC_HAVE_DEVICE)
    PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
    PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}