1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 /* 8 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 9 * therefore we pack data types manually. This file defines packing routines for the standard data types. 10 */ 11 12 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 13 14 /* Operations working like s += t */ 15 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 17 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 18 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 19 /* Ref MPI MAXLOC */ 20 #define OP_XLOC(op,s,t) \ 21 do { \ 22 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 23 else if (!((s).u op (t).u)) s = t; \ 24 } while (0) 25 26 /* DEF_PackFunc - macro defining a Pack routine 27 28 Arguments of the macro: 29 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 30 .BS Block size for vectorization. It is a factor of bsz. 31 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 32 33 Arguments of the Pack routine: 34 +count Number of indices in idx[]. 35 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 36 .opt Per-pack optimization plan. NULL means no such plan. 37 .idx Indices of entries to packed. 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 40 -packed Address of the packed data. 41 */ 42 #define DEF_PackFunc(Type,BS,EQ) \ 43 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 44 { \ 45 const Type *u = (const Type*)unpacked,*u2; \ 46 Type *p = (Type*)packed,*p2; \ 47 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 48 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 49 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 50 PetscFunctionBegin; \ 51 if (!idx) PetscCall(PetscArraycpy(p,u+start*MBS,MBS*count));/* idx[] are contiguous */ \ 52 else if (opt) { /* has optimizations available */ \ 53 p2 = p; \ 54 for (r=0; r<opt->n; r++) { \ 55 u2 = u + opt->start[r]*MBS; \ 56 X = opt->X[r]; \ 57 Y = opt->Y[r]; \ 58 for (k=0; k<opt->dz[r]; k++) \ 59 for (j=0; j<opt->dy[r]; j++) { \ 60 PetscCall(PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS)); \ 61 p2 += opt->dx[r]*MBS; \ 62 } \ 63 } \ 64 } else { \ 65 for (i=0; i<count; i++) \ 66 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 67 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 68 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 69 } \ 70 PetscFunctionReturn(0); \ 71 } 72 73 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 74 and inserts into a sparse array. 75 76 Arguments: 77 .Type Type of the data 78 .BS Block size for vectorization 79 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 80 81 Notes: 82 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 83 */ 84 #define DEF_UnpackFunc(Type,BS,EQ) \ 85 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 86 { \ 87 Type *u = (Type*)unpacked,*u2; \ 88 const Type *p = (const Type*)packed; \ 89 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 90 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 91 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 92 PetscFunctionBegin; \ 93 if (!idx) { \ 94 u += start*MBS; \ 95 if (u != p) PetscCall(PetscArraycpy(u,p,count*MBS)); \ 96 } else if (opt) { /* has optimizations available */ \ 97 for (r=0; r<opt->n; r++) { \ 98 u2 = u + opt->start[r]*MBS; \ 99 X = opt->X[r]; \ 100 Y = opt->Y[r]; \ 101 for (k=0; k<opt->dz[r]; k++) \ 102 for (j=0; j<opt->dy[r]; j++) { \ 103 PetscCall(PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS)); \ 104 p += opt->dx[r]*MBS; \ 105 } \ 106 } \ 107 } else { \ 108 for (i=0; i<count; i++) \ 109 for (j=0; j<M; j++) \ 110 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 111 } \ 112 PetscFunctionReturn(0); \ 113 } 114 115 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 116 117 Arguments: 118 +Opname Name of the Op, such as Add, Mult, LAND, etc. 119 .Type Type of the data 120 .BS Block size for vectorization 121 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 122 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 123 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 124 */ 125 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 126 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 127 { \ 128 Type *u = (Type*)unpacked,*u2; \ 129 const Type *p = (const Type*)packed; \ 130 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 131 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 132 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 133 PetscFunctionBegin; \ 134 if (!idx) { \ 135 u += start*MBS; \ 136 for (i=0; i<count; i++) \ 137 for (j=0; j<M; j++) \ 138 for (k=0; k<BS; k++) \ 139 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 140 } else if (opt) { /* idx[] has patterns */ \ 141 for (r=0; r<opt->n; r++) { \ 142 u2 = u + opt->start[r]*MBS; \ 143 X = opt->X[r]; \ 144 Y = opt->Y[r]; \ 145 for (k=0; k<opt->dz[r]; k++) \ 146 for (j=0; j<opt->dy[r]; j++) { \ 147 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 148 p += opt->dx[r]*MBS; \ 149 } \ 150 } \ 151 } else { \ 152 for (i=0; i<count; i++) \ 153 for (j=0; j<M; j++) \ 154 for (k=0; k<BS; k++) \ 155 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 156 } \ 157 PetscFunctionReturn(0); \ 158 } 159 160 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 161 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 162 { \ 163 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 164 PetscInt i,j,k,r,l,bs=link->bs; \ 165 const PetscInt M = (EQ) ? 1 : bs/BS; \ 166 const PetscInt MBS = M*BS; \ 167 PetscFunctionBegin; \ 168 for (i=0; i<count; i++) { \ 169 r = (!idx ? start+i : idx[i])*MBS; \ 170 l = i*MBS; \ 171 for (j=0; j<M; j++) \ 172 for (k=0; k<BS; k++) { \ 173 tmp = u[r+j*BS+k]; \ 174 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 175 p[l+j*BS+k] = tmp; \ 176 } \ 177 } \ 178 PetscFunctionReturn(0); \ 179 } 180 181 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 182 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 183 { \ 184 const Type *u = (const Type*)src; \ 185 Type *v = (Type*)dst; \ 186 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 187 const PetscInt M = (EQ) ? 1 : bs/BS; \ 188 const PetscInt MBS = M*BS; \ 189 PetscFunctionBegin; \ 190 if (!srcIdx) { /* src is contiguous */ \ 191 u += srcStart*MBS; \ 192 PetscCall(CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u)); \ 193 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 194 u += srcOpt->start[0]*MBS; \ 195 v += dstStart*MBS; \ 196 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 197 for (k=0; k<srcOpt->dz[0]; k++) \ 198 for (j=0; j<srcOpt->dy[0]; j++) { \ 199 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 200 v += srcOpt->dx[0]*MBS; \ 201 } \ 202 } else { /* all other cases */ \ 203 for (i=0; i<count; i++) { \ 204 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 205 t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 206 for (j=0; j<M; j++) \ 207 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 208 } \ 209 } \ 210 PetscFunctionReturn(0); \ 211 } 212 213 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 214 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 215 { \ 216 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 217 const Type *ldata = (const Type*)leafdata; \ 218 PetscInt i,j,k,r,l,bs = link->bs; \ 219 const PetscInt M = (EQ) ? 1 : bs/BS; \ 220 const PetscInt MBS = M*BS; \ 221 PetscFunctionBegin; \ 222 for (i=0; i<count; i++) { \ 223 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 224 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 225 for (j=0; j<M; j++) \ 226 for (k=0; k<BS; k++) { \ 227 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 228 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 229 } \ 230 } \ 231 PetscFunctionReturn(0); \ 232 } 233 234 /* Pack, Unpack/Fetch ops */ 235 #define DEF_Pack(Type,BS,EQ) \ 236 DEF_PackFunc(Type,BS,EQ) \ 237 DEF_UnpackFunc(Type,BS,EQ) \ 238 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 239 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 240 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 241 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 242 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 243 } 244 245 /* Add, Mult ops */ 246 #define DEF_Add(Type,BS,EQ) \ 247 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 248 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 249 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 250 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 251 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 252 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 253 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 254 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 255 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 256 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 257 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 258 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 259 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 260 } 261 262 /* Max, Min ops */ 263 #define DEF_Cmp(Type,BS,EQ) \ 264 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 265 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 266 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 267 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 268 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 269 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 270 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 271 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 272 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 273 } 274 275 /* Logical ops. 276 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 277 the compilation warning "empty macro arguments are undefined in ISO C90" 278 */ 279 #define DEF_Log(Type,BS,EQ) \ 280 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 281 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 282 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 283 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 284 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 285 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 286 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 287 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 288 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 289 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 290 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 291 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 292 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 293 } 294 295 /* Bitwise ops */ 296 #define DEF_Bit(Type,BS,EQ) \ 297 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 298 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 299 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 300 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 301 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 302 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 303 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 304 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 305 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 306 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 307 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 308 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 309 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 310 } 311 312 /* Maxloc, Minloc ops */ 313 #define DEF_Xloc(Type,BS,EQ) \ 314 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 315 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 316 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 317 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 318 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 319 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 320 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 321 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 322 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 323 } 324 325 #define DEF_IntegerType(Type,BS,EQ) \ 326 DEF_Pack(Type,BS,EQ) \ 327 DEF_Add(Type,BS,EQ) \ 328 DEF_Cmp(Type,BS,EQ) \ 329 DEF_Log(Type,BS,EQ) \ 330 DEF_Bit(Type,BS,EQ) \ 331 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 332 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 333 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 334 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 335 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 336 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 337 } 338 339 #define DEF_RealType(Type,BS,EQ) \ 340 DEF_Pack(Type,BS,EQ) \ 341 DEF_Add(Type,BS,EQ) \ 342 DEF_Cmp(Type,BS,EQ) \ 343 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 344 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 345 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 346 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 347 } 348 349 #if defined(PETSC_HAVE_COMPLEX) 350 #define DEF_ComplexType(Type,BS,EQ) \ 351 DEF_Pack(Type,BS,EQ) \ 352 DEF_Add(Type,BS,EQ) \ 353 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 354 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 355 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 356 } 357 #endif 358 359 #define DEF_DumbType(Type,BS,EQ) \ 360 DEF_Pack(Type,BS,EQ) \ 361 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 362 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 363 } 364 365 /* Maxloc, Minloc */ 366 #define DEF_PairType(Type,BS,EQ) \ 367 DEF_Pack(Type,BS,EQ) \ 368 DEF_Xloc(Type,BS,EQ) \ 369 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 370 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 371 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 372 } 373 374 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 375 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 376 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 377 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 378 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 379 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 380 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 381 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 382 383 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 384 DEF_IntegerType(int,1,1) 385 DEF_IntegerType(int,2,1) 386 DEF_IntegerType(int,4,1) 387 DEF_IntegerType(int,8,1) 388 DEF_IntegerType(int,1,0) 389 DEF_IntegerType(int,2,0) 390 DEF_IntegerType(int,4,0) 391 DEF_IntegerType(int,8,0) 392 #endif 393 394 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 395 typedef signed char SignedChar; 396 DEF_IntegerType(SignedChar,1,1) 397 DEF_IntegerType(SignedChar,2,1) 398 DEF_IntegerType(SignedChar,4,1) 399 DEF_IntegerType(SignedChar,8,1) 400 DEF_IntegerType(SignedChar,1,0) 401 DEF_IntegerType(SignedChar,2,0) 402 DEF_IntegerType(SignedChar,4,0) 403 DEF_IntegerType(SignedChar,8,0) 404 405 typedef unsigned char UnsignedChar; 406 DEF_IntegerType(UnsignedChar,1,1) 407 DEF_IntegerType(UnsignedChar,2,1) 408 DEF_IntegerType(UnsignedChar,4,1) 409 DEF_IntegerType(UnsignedChar,8,1) 410 DEF_IntegerType(UnsignedChar,1,0) 411 DEF_IntegerType(UnsignedChar,2,0) 412 DEF_IntegerType(UnsignedChar,4,0) 413 DEF_IntegerType(UnsignedChar,8,0) 414 415 DEF_RealType(PetscReal,1,1) 416 DEF_RealType(PetscReal,2,1) 417 DEF_RealType(PetscReal,4,1) 418 DEF_RealType(PetscReal,8,1) 419 DEF_RealType(PetscReal,1,0) 420 DEF_RealType(PetscReal,2,0) 421 DEF_RealType(PetscReal,4,0) 422 DEF_RealType(PetscReal,8,0) 423 424 #if defined(PETSC_HAVE_COMPLEX) 425 DEF_ComplexType(PetscComplex,1,1) 426 DEF_ComplexType(PetscComplex,2,1) 427 DEF_ComplexType(PetscComplex,4,1) 428 DEF_ComplexType(PetscComplex,8,1) 429 DEF_ComplexType(PetscComplex,1,0) 430 DEF_ComplexType(PetscComplex,2,0) 431 DEF_ComplexType(PetscComplex,4,0) 432 DEF_ComplexType(PetscComplex,8,0) 433 #endif 434 435 #define PairType(Type1,Type2) Type1##_##Type2 436 typedef struct {int u; int i;} PairType(int,int); 437 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 438 DEF_PairType(PairType(int,int),1,1) 439 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 440 441 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 442 DEF_DumbType(char,1,1) 443 DEF_DumbType(char,2,1) 444 DEF_DumbType(char,4,1) 445 DEF_DumbType(char,1,0) 446 DEF_DumbType(char,2,0) 447 DEF_DumbType(char,4,0) 448 449 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 450 DEF_DumbType(DumbInt,1,1) 451 DEF_DumbType(DumbInt,2,1) 452 DEF_DumbType(DumbInt,4,1) 453 DEF_DumbType(DumbInt,8,1) 454 DEF_DumbType(DumbInt,1,0) 455 DEF_DumbType(DumbInt,2,0) 456 DEF_DumbType(DumbInt,4,0) 457 DEF_DumbType(DumbInt,8,0) 458 459 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 460 { 461 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 462 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 463 464 PetscFunctionBegin; 465 /* Destroy device-specific fields */ 466 if (link->deviceinited) PetscCall((*link->Destroy)(sf,link)); 467 468 /* Destroy host related fields */ 469 if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); 470 if (!link->use_nvshmem) { 471 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 472 if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i])); 473 } 474 PetscCall(PetscFree(link->reqs)); 475 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 476 PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST])); 477 PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST])); 478 } 479 } 480 PetscCall(PetscFree(link)); 481 PetscFunctionReturn(0); 482 } 483 484 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 485 { 486 PetscFunctionBegin; 487 PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata)); 488 #if defined(PETSC_HAVE_NVSHMEM) 489 { 490 PetscBool use_nvshmem; 491 PetscCall(PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem)); 492 if (use_nvshmem) { 493 PetscCall(PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink)); 494 PetscFunctionReturn(0); 495 } 496 } 497 #endif 498 PetscCall(PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink)); 499 PetscFunctionReturn(0); 500 } 501 502 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 503 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 504 */ 505 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 506 { 507 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 508 PetscInt i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks; 509 const PetscInt *rootoffset,*leafoffset; 510 MPI_Aint disp; 511 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 512 MPI_Datatype unit = link->unit; 513 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 514 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 515 516 PetscFunctionBegin; 517 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 518 if (sf->persistent) { 519 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 520 PetscCall(PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL)); 521 if (direction == PETSCSF_LEAF2ROOT) { 522 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 523 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 524 cnt = rootoffset[i+1]-rootoffset[i]; 525 PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j)); 526 } 527 } else { /* PETSCSF_ROOT2LEAF */ 528 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 529 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 530 cnt = rootoffset[i+1]-rootoffset[i]; 531 PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j)); 532 } 533 } 534 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 535 } 536 537 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 538 PetscCall(PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL)); 539 if (direction == PETSCSF_LEAF2ROOT) { 540 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 541 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 542 cnt = leafoffset[i+1]-leafoffset[i]; 543 PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j)); 544 } 545 } else { /* PETSCSF_ROOT2LEAF */ 546 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 547 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 548 cnt = leafoffset[i+1]-leafoffset[i]; 549 PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j)); 550 } 551 } 552 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 553 } 554 } 555 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 556 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 557 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 558 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 559 PetscFunctionReturn(0); 560 } 561 562 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 563 { 564 PetscSFLink link,*p; 565 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 566 567 PetscFunctionBegin; 568 /* Look for types in cache */ 569 for (p=&bas->inuse; (link=*p); p=&link->next) { 570 PetscBool match; 571 PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match)); 572 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 573 switch (cmode) { 574 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 575 case PETSC_USE_POINTER: break; 576 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 577 } 578 *mylink = link; 579 PetscFunctionReturn(0); 580 } 581 } 582 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 583 } 584 585 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 586 { 587 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 588 PetscSFLink link = *mylink; 589 590 PetscFunctionBegin; 591 link->rootdata = NULL; 592 link->leafdata = NULL; 593 link->next = bas->avail; 594 bas->avail = link; 595 *mylink = NULL; 596 PetscFunctionReturn(0); 597 } 598 599 /* Error out on unsupported overlapped communications */ 600 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 601 { 602 PetscSFLink link,*p; 603 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 604 PetscBool match; 605 606 PetscFunctionBegin; 607 if (PetscDefined(USE_DEBUG)) { 608 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 609 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 610 */ 611 if (rootdata || leafdata) { 612 for (p=&bas->inuse; (link=*p); p=&link->next) { 613 PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match)); 614 PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata,PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 615 } 616 } 617 } 618 PetscFunctionReturn(0); 619 } 620 621 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 622 { 623 PetscFunctionBegin; 624 if (n) PetscCall(PetscMemcpy(dst,src,n)); 625 PetscFunctionReturn(0); 626 } 627 628 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 629 { 630 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 631 PetscBool is2Int,is2PetscInt; 632 PetscMPIInt ni,na,nd,combiner; 633 #if defined(PETSC_HAVE_COMPLEX) 634 PetscInt nPetscComplex=0; 635 #endif 636 637 PetscFunctionBegin; 638 PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar)); 639 PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar)); 640 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 641 PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt)); 642 PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt)); 643 PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal)); 644 #if defined(PETSC_HAVE_COMPLEX) 645 PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex)); 646 #endif 647 PetscCall(MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int)); 648 PetscCall(MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt)); 649 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 650 PetscCallMPI(MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner)); 651 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 652 link->bs = 1; /* default */ 653 654 if (is2Int) { 655 PackInit_PairType_int_int_1_1(link); 656 link->bs = 1; 657 link->unitbytes = 2*sizeof(int); 658 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 659 link->basicunit = MPI_2INT; 660 link->unit = MPI_2INT; 661 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 662 PackInit_PairType_PetscInt_PetscInt_1_1(link); 663 link->bs = 1; 664 link->unitbytes = 2*sizeof(PetscInt); 665 link->basicunit = MPIU_2INT; 666 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 667 link->unit = MPIU_2INT; 668 } else if (nPetscReal) { 669 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 670 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 671 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 672 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 673 link->bs = nPetscReal; 674 link->unitbytes = nPetscReal*sizeof(PetscReal); 675 link->basicunit = MPIU_REAL; 676 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 677 } else if (nPetscInt) { 678 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 679 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 680 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 681 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 682 link->bs = nPetscInt; 683 link->unitbytes = nPetscInt*sizeof(PetscInt); 684 link->basicunit = MPIU_INT; 685 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 686 #if defined(PETSC_USE_64BIT_INDICES) 687 } else if (nInt) { 688 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 689 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 690 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 691 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 692 link->bs = nInt; 693 link->unitbytes = nInt*sizeof(int); 694 link->basicunit = MPI_INT; 695 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 696 #endif 697 } else if (nSignedChar) { 698 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 699 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 700 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 701 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 702 link->bs = nSignedChar; 703 link->unitbytes = nSignedChar*sizeof(SignedChar); 704 link->basicunit = MPI_SIGNED_CHAR; 705 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 706 } else if (nUnsignedChar) { 707 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 708 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 709 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 710 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 711 link->bs = nUnsignedChar; 712 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 713 link->basicunit = MPI_UNSIGNED_CHAR; 714 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 715 #if defined(PETSC_HAVE_COMPLEX) 716 } else if (nPetscComplex) { 717 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 718 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 719 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 720 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 721 link->bs = nPetscComplex; 722 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 723 link->basicunit = MPIU_COMPLEX; 724 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 725 #endif 726 } else { 727 MPI_Aint lb,nbyte; 728 PetscCallMPI(MPI_Type_get_extent(unit,&lb,&nbyte)); 729 PetscCheck(lb == 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb); 730 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 731 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 732 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 733 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 734 link->bs = nbyte; 735 link->unitbytes = nbyte; 736 link->basicunit = MPI_BYTE; 737 } else { 738 nInt = nbyte / sizeof(int); 739 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 740 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 741 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 742 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 743 link->bs = nInt; 744 link->unitbytes = nbyte; 745 link->basicunit = MPI_INT; 746 } 747 if (link->isbuiltin) link->unit = unit; 748 } 749 750 if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit,&link->unit)); 751 752 link->Memcpy = PetscSFLinkMemcpy_Host; 753 PetscFunctionReturn(0); 754 } 755 756 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 757 { 758 PetscFunctionBegin; 759 *UnpackAndOp = NULL; 760 if (PetscMemTypeHost(mtype)) { 761 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 762 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 763 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 764 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 765 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 766 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 767 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 768 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 769 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 770 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 771 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 772 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 773 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 774 } 775 #if defined(PETSC_HAVE_DEVICE) 776 else if (PetscMemTypeDevice(mtype) && !atomic) { 777 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 778 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 779 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 780 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 781 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 782 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 783 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 784 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 785 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 786 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 787 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 788 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 789 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 790 } else if (PetscMemTypeDevice(mtype) && atomic) { 791 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 792 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 793 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 794 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 795 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 796 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 797 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 798 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 799 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 800 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 801 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 802 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 803 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 804 } 805 #endif 806 PetscFunctionReturn(0); 807 } 808 809 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 810 { 811 PetscFunctionBegin; 812 *ScatterAndOp = NULL; 813 if (PetscMemTypeHost(mtype)) { 814 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 815 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 816 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 817 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 818 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 819 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 820 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 821 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 822 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 823 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 824 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 825 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 826 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 827 } 828 #if defined(PETSC_HAVE_DEVICE) 829 else if (PetscMemTypeDevice(mtype) && !atomic) { 830 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 831 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 832 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 833 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 834 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 835 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 836 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 837 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 838 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 839 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 840 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 841 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 842 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 843 } else if (PetscMemTypeDevice(mtype) && atomic) { 844 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 845 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 846 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 847 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 848 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 849 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 850 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 851 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 852 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 853 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 854 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 855 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 856 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 857 } 858 #endif 859 PetscFunctionReturn(0); 860 } 861 862 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 863 { 864 PetscFunctionBegin; 865 *FetchAndOp = NULL; 866 PetscCheck(op == MPI_SUM || op == MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 867 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 868 #if defined(PETSC_HAVE_DEVICE) 869 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 870 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 871 #endif 872 PetscFunctionReturn(0); 873 } 874 875 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 876 { 877 PetscFunctionBegin; 878 *FetchAndOpLocal = NULL; 879 PetscCheck(op == MPI_SUM || op == MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 880 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 881 #if defined(PETSC_HAVE_DEVICE) 882 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 883 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 884 #endif 885 PetscFunctionReturn(0); 886 } 887 888 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 889 { 890 PetscLogDouble flops; 891 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 892 893 PetscFunctionBegin; 894 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 895 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 896 #if defined(PETSC_HAVE_DEVICE) 897 if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); else 898 #endif 899 PetscCall(PetscLogFlops(flops)); 900 } 901 PetscFunctionReturn(0); 902 } 903 904 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 905 { 906 PetscLogDouble flops; 907 908 PetscFunctionBegin; 909 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 910 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 911 #if defined(PETSC_HAVE_DEVICE) 912 if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); else 913 #endif 914 PetscCall(PetscLogFlops(flops)); 915 } 916 PetscFunctionReturn(0); 917 } 918 919 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 920 Input Parameters: 921 +sf - The StarForest 922 .link - The link 923 .count - Number of entries to unpack 924 .start - The first index, significent when indices=NULL 925 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 926 .buf - A contiguous buffer to unpack from 927 -op - Operation after unpack 928 929 Output Parameters: 930 .data - The data to unpack to 931 */ 932 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 933 { 934 PetscFunctionBegin; 935 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 936 { 937 PetscInt i; 938 if (indices) { 939 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 940 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 941 */ 942 for (i=0; i<count; i++) PetscCallMPI(MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op)); 943 } else { 944 PetscCallMPI(MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op)); 945 } 946 } 947 PetscFunctionReturn(0); 948 #else 949 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 950 #endif 951 } 952 953 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 954 { 955 PetscFunctionBegin; 956 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 957 { 958 PetscInt i,disp; 959 if (!srcIdx) { 960 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op)); 961 } else { 962 for (i=0; i<count; i++) { 963 disp = dstIdx? dstIdx[i] : dstStart + i; 964 PetscCallMPI(MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op)); 965 } 966 } 967 } 968 PetscFunctionReturn(0); 969 #else 970 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 971 #endif 972 } 973 974 /*============================================================================= 975 Pack/Unpack/Fetch/Scatter routines 976 ============================================================================*/ 977 978 /* Pack rootdata to rootbuf 979 Input Parameters: 980 + sf - The SF this packing works on. 981 . link - It gives the memtype of the roots and also provides root buffer. 982 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 983 - rootdata - Where to read the roots. 984 985 Notes: 986 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 987 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 988 */ 989 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 990 { 991 const PetscInt *rootindices = NULL; 992 PetscInt count,start; 993 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 994 PetscMemType rootmtype = link->rootmtype; 995 PetscSFPackOpt opt = NULL; 996 997 PetscFunctionBegin; 998 PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 999 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1000 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices)); 1001 PetscCall(PetscSFLinkGetPack(link,rootmtype,&Pack)); 1002 PetscCall((*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype])); 1003 } 1004 PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 1005 PetscFunctionReturn(0); 1006 } 1007 1008 /* Pack leafdata to leafbuf */ 1009 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1010 { 1011 const PetscInt *leafindices = NULL; 1012 PetscInt count,start; 1013 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1014 PetscMemType leafmtype = link->leafmtype; 1015 PetscSFPackOpt opt = NULL; 1016 1017 PetscFunctionBegin; 1018 PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 1019 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1020 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices)); 1021 PetscCall(PetscSFLinkGetPack(link,leafmtype,&Pack)); 1022 PetscCall((*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype])); 1023 } 1024 PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 1025 PetscFunctionReturn(0); 1026 } 1027 1028 /* Pack rootdata to rootbuf, which are in the same memory space */ 1029 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1030 { 1031 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1032 1033 PetscFunctionBegin; 1034 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1035 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1036 if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1037 } 1038 PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 1039 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf,link,scope,rootdata)); 1040 PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 1041 PetscFunctionReturn(0); 1042 } 1043 /* Pack leafdata to leafbuf, which are in the same memory space */ 1044 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1045 { 1046 PetscFunctionBegin; 1047 if (scope == PETSCSF_REMOTE) { 1048 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1049 if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1050 } 1051 PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 1052 if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata)); 1053 PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 1054 PetscFunctionReturn(0); 1055 } 1056 1057 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1058 { 1059 const PetscInt *rootindices = NULL; 1060 PetscInt count,start; 1061 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1062 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1063 PetscMemType rootmtype = link->rootmtype; 1064 PetscSFPackOpt opt = NULL; 1065 1066 PetscFunctionBegin; 1067 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1068 PetscCall(PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp)); 1069 if (UnpackAndOp) { 1070 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices)); 1071 PetscCall((*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype])); 1072 } else { 1073 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices)); 1074 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op)); 1075 } 1076 } 1077 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op)); 1078 PetscFunctionReturn(0); 1079 } 1080 1081 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1082 { 1083 const PetscInt *leafindices = NULL; 1084 PetscInt count,start; 1085 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1086 PetscMemType leafmtype = link->leafmtype; 1087 PetscSFPackOpt opt = NULL; 1088 1089 PetscFunctionBegin; 1090 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1091 PetscCall(PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp)); 1092 if (UnpackAndOp) { 1093 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices)); 1094 PetscCall((*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype])); 1095 } else { 1096 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices)); 1097 PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op)); 1098 } 1099 } 1100 PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op)); 1101 PetscFunctionReturn(0); 1102 } 1103 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1104 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1105 { 1106 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1107 1108 PetscFunctionBegin; 1109 PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0)); 1110 if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op)); 1111 PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0)); 1112 if (scope == PETSCSF_REMOTE) { 1113 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 1114 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1115 } 1116 PetscFunctionReturn(0); 1117 } 1118 1119 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1120 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1121 { 1122 PetscFunctionBegin; 1123 PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0)); 1124 if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op)); 1125 PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0)); 1126 if (scope == PETSCSF_REMOTE) { 1127 if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 1128 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 1129 } 1130 PetscFunctionReturn(0); 1131 } 1132 1133 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1134 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1135 { 1136 const PetscInt *rootindices = NULL; 1137 PetscInt count,start; 1138 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1139 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1140 PetscMemType rootmtype = link->rootmtype; 1141 PetscSFPackOpt opt = NULL; 1142 1143 PetscFunctionBegin; 1144 PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0)); 1145 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1146 /* Do FetchAndOp on rootdata with rootbuf */ 1147 PetscCall(PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp)); 1148 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices)); 1149 PetscCall((*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype])); 1150 } 1151 PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op)); 1152 PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0)); 1153 PetscFunctionReturn(0); 1154 } 1155 1156 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1157 { 1158 const PetscInt *rootindices = NULL,*leafindices = NULL; 1159 PetscInt count,rootstart,leafstart; 1160 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1161 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1162 PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1163 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1164 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1165 char *srcbuf = NULL,*dstbuf = NULL; 1166 PetscBool dstdups; 1167 1168 PetscFunctionBegin; 1169 if (!buflen) PetscFunctionReturn(0); 1170 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1171 if (direction == PETSCSF_ROOT2LEAF) { 1172 PetscCall(PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata)); 1173 srcmtype = rootmtype; 1174 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1175 dstmtype = leafmtype; 1176 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1177 } else { 1178 PetscCall(PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata)); 1179 srcmtype = leafmtype; 1180 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1181 dstmtype = rootmtype; 1182 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1183 } 1184 PetscCall((*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes)); 1185 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1186 if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link)); 1187 if (direction == PETSCSF_ROOT2LEAF) { 1188 PetscCall(PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op)); 1189 } else { 1190 PetscCall(PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op)); 1191 } 1192 } else { 1193 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1194 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1195 PetscCall(PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp)); 1196 if (ScatterAndOp) { 1197 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices)); 1198 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices)); 1199 if (direction == PETSCSF_ROOT2LEAF) { 1200 PetscCall((*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata)); 1201 } else { 1202 PetscCall((*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata)); 1203 } 1204 } else { 1205 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices)); 1206 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices)); 1207 if (direction == PETSCSF_ROOT2LEAF) { 1208 PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op)); 1209 } else { 1210 PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op)); 1211 } 1212 } 1213 } 1214 PetscFunctionReturn(0); 1215 } 1216 1217 /* Fetch rootdata to leafdata and leafupdate locally */ 1218 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1219 { 1220 const PetscInt *rootindices = NULL,*leafindices = NULL; 1221 PetscInt count,rootstart,leafstart; 1222 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1223 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1224 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1225 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1226 1227 PetscFunctionBegin; 1228 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1229 if (rootmtype != leafmtype) { 1230 /* The local communication has to go through pack and unpack */ 1231 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1232 } else { 1233 PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices)); 1234 PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices)); 1235 PetscCall(PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal)); 1236 PetscCall((*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate)); 1237 } 1238 PetscFunctionReturn(0); 1239 } 1240 1241 /* 1242 Create per-rank pack/unpack optimizations based on indice patterns 1243 1244 Input Parameters: 1245 + n - Number of destination ranks 1246 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1247 - idx - [*] Array storing indices 1248 1249 Output Parameters: 1250 + opt - Pack optimizations. NULL if no optimizations. 1251 */ 1252 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1253 { 1254 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1255 PetscBool optimizable = PETSC_TRUE; 1256 PetscSFPackOpt opt; 1257 1258 PetscFunctionBegin; 1259 PetscCall(PetscMalloc1(1,&opt)); 1260 PetscCall(PetscMalloc1(7*n+2,&opt->array)); 1261 opt->n = opt->array[0] = n; 1262 opt->offset = opt->array + 1; 1263 opt->start = opt->array + n + 2; 1264 opt->dx = opt->array + 2*n + 2; 1265 opt->dy = opt->array + 3*n + 2; 1266 opt->dz = opt->array + 4*n + 2; 1267 opt->X = opt->array + 5*n + 2; 1268 opt->Y = opt->array + 6*n + 2; 1269 1270 for (r=0; r<n; r++) { /* For each destination rank */ 1271 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1272 p = offset[r]; 1273 start = idx[p]; /* First index for this rank */ 1274 p++; 1275 1276 /* Search in X dimension */ 1277 for (dx=1; dx<m; dx++,p++) { 1278 if (start+dx != idx[p]) break; 1279 } 1280 1281 dydz = m/dx; 1282 X = dydz > 1 ? (idx[p]-start) : dx; 1283 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1284 if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1285 for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1286 for (i=0; i<dx; i++,p++) { 1287 if (start+X*dy+i != idx[p]) { 1288 if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1289 else goto Z_dimension; 1290 } 1291 } 1292 } 1293 1294 Z_dimension: 1295 dz = m/(dx*dy); 1296 Y = dz > 1 ? (idx[p]-start)/X : dy; 1297 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1298 if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1299 for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1300 for (j=0; j<dy; j++) { 1301 for (i=0; i<dx; i++,p++) { 1302 if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 1303 } 1304 } 1305 } 1306 opt->start[r] = start; 1307 opt->dx[r] = dx; 1308 opt->dy[r] = dy; 1309 opt->dz[r] = dz; 1310 opt->X[r] = X; 1311 opt->Y[r] = Y; 1312 } 1313 1314 finish: 1315 /* If not optimizable, free arrays to save memory */ 1316 if (!n || !optimizable) { 1317 PetscCall(PetscFree(opt->array)); 1318 PetscCall(PetscFree(opt)); 1319 *out = NULL; 1320 } else { 1321 opt->offset[0] = 0; 1322 for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1323 *out = opt; 1324 } 1325 PetscFunctionReturn(0); 1326 } 1327 1328 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 1329 { 1330 PetscSFPackOpt opt = *out; 1331 1332 PetscFunctionBegin; 1333 if (opt) { 1334 PetscCall(PetscSFFree(sf,mtype,opt->array)); 1335 PetscCall(PetscFree(opt)); 1336 *out = NULL; 1337 } 1338 PetscFunctionReturn(0); 1339 } 1340 1341 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1342 { 1343 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1344 PetscInt i,j; 1345 1346 PetscFunctionBegin; 1347 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1348 for (i=0; i<2; i++) { /* Set defaults */ 1349 sf->leafstart[i] = 0; 1350 sf->leafcontig[i] = PETSC_TRUE; 1351 sf->leafdups[i] = PETSC_FALSE; 1352 bas->rootstart[i] = 0; 1353 bas->rootcontig[i] = PETSC_TRUE; 1354 bas->rootdups[i] = PETSC_FALSE; 1355 } 1356 1357 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1358 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1359 1360 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1361 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1362 1363 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1364 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1365 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1366 } 1367 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1368 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1369 } 1370 1371 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1372 if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0])); 1373 if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1])); 1374 1375 /* Are root indices for self and remote contiguous? */ 1376 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1377 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1378 1379 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1380 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1381 1382 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1383 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1384 } 1385 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1386 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1387 } 1388 1389 if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0])); 1390 if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1])); 1391 1392 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1393 if (PetscDefined(HAVE_DEVICE)) { 1394 PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1395 if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0])); 1396 if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1])); 1397 if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0])); 1398 if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1])); 1399 } 1400 PetscFunctionReturn(0); 1401 } 1402 1403 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1404 { 1405 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1406 PetscInt i; 1407 1408 PetscFunctionBegin; 1409 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1410 PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i])); 1411 PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i])); 1412 #if defined(PETSC_HAVE_DEVICE) 1413 PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i])); 1414 PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i])); 1415 #endif 1416 } 1417 PetscFunctionReturn(0); 1418 } 1419