#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Paste four tokens together with underscores, e.g., CPPJoin4(Pack,int,1,1) -> Pack_int_1_1 */
#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)     /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)     /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
/* Ref MPI MAXLOC: pairs compare on .u; on a tie the smaller index .i wins */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while (0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type  Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS    Block size for vectorization. It is a factor of bs.
   -EQ    (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
*/
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);} /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2 += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
   +Type  Type of the data
   .BS    Block size for vectorization
   -EQ    (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is kept separate from DEF_UnpackAndOp because we want to use memcpy in this macro.
*/
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip the copy when src and dst alias */ \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname   Name of the Op, such as Add, Mult, LAND, etc.
   .Type     Type of the data
   .BS       Block size for vectorization
   .EQ       (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op       Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   -OpApply  Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
*/
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each entry the old root value is
   returned in the packed buffer and the root is updated with op(root,buffer) */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ?
start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; /* return the old root value in the buffer */ \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that applies op(dst,src) entrywise,
   with src/dst each addressed contiguously (from xxxStart), via a 3D pattern (xxxOpt) or via indices (xxxIdx) */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - like DEF_FetchAndOp but for local (on-process) communication:
   leafupdate receives the old root values and roots are updated with op(root,leaf) */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; /* fetch the old root value first */ \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,            Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void
CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Logical ops.
   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90"
*/
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops (on pair types; the underlying kernels are named Max/Min but use OP_XLOC) */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Integer types get arithmetic, comparison, logical and bitwise ops */
#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

/* Real types get arithmetic and comparison ops (no logical/bitwise) */
#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
/* Complex types get arithmetic ops only (no ordering) */
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

/* Dumb types get pack/insert only (no reductions) */
#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void
CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types for MAXLOC/MINLOC: .u is the value compared, .i is the index */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;} PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback MPI_Type_dup for MPI implementations lacking it: a committed contiguous copy of size 1 */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/* Destroy a link: device fields, the dup'ed MPI datatype, persistent requests and host buffers */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

  PetscFunctionBegin;
  /* Destroy device-specific fields */
  if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}

  /* Destroy host related fields */
  if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
  if (!link->use_nvshmem) {
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed.
*/
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Create a link for the given communication; dispatches to the NVSHMEM path when it is enabled and applicable,
   otherwise to the MPI path */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
#if defined(PETSC_HAVE_NVSHMEM)
  {
    PetscBool use_nvshmem;
    ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
    if (use_nvshmem) {
      ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
  }
#endif
  ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Roots receive from remote leaf ranks; [ndrootranks,nrootranks) are the off-process ranks */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          cnt  = rootoffset[i+1]-rootoffset[i];
          ierr = MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          cnt  = rootoffset[i+1]-rootoffset[i];
          ierr = MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Leaves send to remote root ranks */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          cnt  = leafoffset[i+1]-leafoffset[i];
          ierr = MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          cnt  = leafoffset[i+1]-leafoffset[i];
          ierr = MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}

/* Find an in-use link matching (unit, rootdata, leafdata); with PETSC_OWN_POINTER the link is removed
   from the in-use list. Errors if no match is found. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
}

/* Return a link to the SF's available list for later reuse */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
{
  PetscSF_Basic *bas  = (PetscSF_Basic*)sf->data;
  PetscSFLink   link = *mylink;

  PetscFunctionBegin;
  link->rootdata = NULL;
  link->leafdata = NULL;
  link->next     = bas->avail;
  bas->avail     = link;
  *mylink        = NULL;
  PetscFunctionReturn(0);
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
       the potential overlapping since this process does not participate in communication. Overlapping is harmless.
    */
    if (rootdata || leafdata) {
      for (p=&bas->inuse; (link=*p); p=&link->next) {
        ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
        PetscAssertFalse(match && (rootdata == link->rootdata) && (leafdata == link->leafdata),PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type.
Undo the overlapping to avoid the error.",rootdata,leafdata);
      }
    }
  }
  PetscFunctionReturn(0);
}

/* Host memcpy wrapper; the memtype arguments are accepted to match the link Memcpy interface but are unused on host */
static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
{
  PetscFunctionBegin;
  if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

/* Set up host pack/unpack/reduce kernels on the link according to the MPI datatype `unit`:
   decompose unit into n copies of a known basic type, then pick the kernel instantiation whose
   block size BS divides n (preferring larger BS; EQ=1 when n==BS exactly). */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else { /* Unknown basic type: treat as a dumb byte/int stream */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
    PetscAssertFalse(lb != 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 753 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 754 link->bs = nbyte; 755 link->unitbytes = nbyte; 756 link->basicunit = MPI_BYTE; 757 } else { 758 nInt = nbyte / sizeof(int); 759 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 760 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 761 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 762 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 763 link->bs = nInt; 764 link->unitbytes = nbyte; 765 link->basicunit = MPI_INT; 766 } 767 if (link->isbuiltin) link->unit = unit; 768 } 769 770 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 771 772 link->Memcpy = PetscSFLinkMemcpy_Host; 773 PetscFunctionReturn(0); 774 } 775 776 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 777 { 778 PetscFunctionBegin; 779 *UnpackAndOp = NULL; 780 if (PetscMemTypeHost(mtype)) { 781 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 782 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 783 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 784 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 785 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 786 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 787 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 788 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 789 else if (op == 
MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 790 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 791 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 792 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 793 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 794 } 795 #if defined(PETSC_HAVE_DEVICE) 796 else if (PetscMemTypeDevice(mtype) && !atomic) { 797 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 798 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 799 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 800 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 801 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 802 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 803 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 804 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 805 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 806 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 807 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 808 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 809 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 810 } else if (PetscMemTypeDevice(mtype) && atomic) { 811 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 812 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 813 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 814 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 815 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 816 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 817 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 818 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 819 else if (op 
== MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 820 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 821 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 822 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 823 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 824 } 825 #endif 826 PetscFunctionReturn(0); 827 } 828 829 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 830 { 831 PetscFunctionBegin; 832 *ScatterAndOp = NULL; 833 if (PetscMemTypeHost(mtype)) { 834 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 835 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 836 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 837 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 838 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 839 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 840 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 841 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 842 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 843 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 844 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 845 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 846 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 847 } 848 #if defined(PETSC_HAVE_DEVICE) 849 else if (PetscMemTypeDevice(mtype) && !atomic) { 850 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 851 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 852 else if (op == MPI_PROD) *ScatterAndOp = 
link->d_ScatterAndMult; 853 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 854 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 855 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 856 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 857 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 858 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 859 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 860 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 861 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 862 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 863 } else if (PetscMemTypeDevice(mtype) && atomic) { 864 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 865 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 866 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 867 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 868 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 869 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 870 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 871 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 872 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 873 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 874 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 875 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 876 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 877 } 878 #endif 879 PetscFunctionReturn(0); 880 } 881 882 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode 
(**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 883 { 884 PetscFunctionBegin; 885 *FetchAndOp = NULL; 886 PetscAssertFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 887 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 888 #if defined(PETSC_HAVE_DEVICE) 889 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 890 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 891 #endif 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 896 { 897 PetscFunctionBegin; 898 *FetchAndOpLocal = NULL; 899 PetscAssertFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 900 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 901 #if defined(PETSC_HAVE_DEVICE) 902 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 903 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 904 #endif 905 PetscFunctionReturn(0); 906 } 907 908 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 909 { 910 PetscErrorCode ierr; 911 PetscLogDouble flops; 912 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 913 914 PetscFunctionBegin; 915 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 916 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 917 #if defined(PETSC_HAVE_DEVICE) 918 if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} 
else 919 #endif 920 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 921 } 922 PetscFunctionReturn(0); 923 } 924 925 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 926 { 927 PetscLogDouble flops; 928 PetscErrorCode ierr; 929 930 PetscFunctionBegin; 931 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 932 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 933 #if defined(PETSC_HAVE_DEVICE) 934 if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 935 #endif 936 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 937 } 938 PetscFunctionReturn(0); 939 } 940 941 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 942 Input Parameters: 943 +sf - The StarForest 944 .link - The link 945 .count - Number of entries to unpack 946 .start - The first index, significent when indices=NULL 947 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 948 .buf - A contiguous buffer to unpack from 949 -op - Operation after unpack 950 951 Output Parameters: 952 .data - The data to unpack to 953 */ 954 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 955 { 956 PetscFunctionBegin; 957 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 958 { 959 PetscErrorCode ierr; 960 PetscInt i; 961 if (indices) { 962 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 963 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 
964 */ 965 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 966 } else { 967 ierr = MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op);CHKERRMPI(ierr); 968 } 969 } 970 PetscFunctionReturn(0); 971 #else 972 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 973 #endif 974 } 975 976 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 977 { 978 PetscFunctionBegin; 979 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 980 { 981 PetscErrorCode ierr; 982 PetscInt i,disp; 983 if (!srcIdx) { 984 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 985 } else { 986 for (i=0; i<count; i++) { 987 disp = dstIdx? dstIdx[i] : dstStart + i; 988 ierr = MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 989 } 990 } 991 } 992 PetscFunctionReturn(0); 993 #else 994 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 995 #endif 996 } 997 998 /*============================================================================= 999 Pack/Unpack/Fetch/Scatter routines 1000 ============================================================================*/ 1001 1002 /* Pack rootdata to rootbuf 1003 Input Parameters: 1004 + sf - The SF this packing works on. 1005 . link - It gives the memtype of the roots and also provides root buffer. 1006 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1007 - rootdata - Where to read the roots. 
1008 1009 Notes: 1010 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1011 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 1012 */ 1013 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1014 { 1015 PetscErrorCode ierr; 1016 const PetscInt *rootindices = NULL; 1017 PetscInt count,start; 1018 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1019 PetscMemType rootmtype = link->rootmtype; 1020 PetscSFPackOpt opt = NULL; 1021 1022 PetscFunctionBegin; 1023 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1024 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1025 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1026 ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1027 ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1028 } 1029 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1030 PetscFunctionReturn(0); 1031 } 1032 1033 /* Pack leafdata to leafbuf */ 1034 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1035 { 1036 PetscErrorCode ierr; 1037 const PetscInt *leafindices = NULL; 1038 PetscInt count,start; 1039 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1040 PetscMemType leafmtype = link->leafmtype; 1041 PetscSFPackOpt opt = NULL; 1042 1043 PetscFunctionBegin; 1044 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1045 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1046 ierr = 
PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1047 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1048 ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1049 } 1050 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1051 PetscFunctionReturn(0); 1052 } 1053 1054 /* Pack rootdata to rootbuf, which are in the same memory space */ 1055 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1056 { 1057 PetscErrorCode ierr; 1058 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1059 1060 PetscFunctionBegin; 1061 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1062 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1063 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1064 } 1065 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1066 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);} 1067 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1068 PetscFunctionReturn(0); 1069 } 1070 /* Pack leafdata to leafbuf, which are in the same memory space */ 1071 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1072 { 1073 PetscErrorCode ierr; 1074 1075 PetscFunctionBegin; 1076 if (scope == PETSCSF_REMOTE) { 1077 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1078 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1079 } 1080 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1081 if 
(sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);} 1082 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1083 PetscFunctionReturn(0); 1084 } 1085 1086 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1087 { 1088 PetscErrorCode ierr; 1089 const PetscInt *rootindices = NULL; 1090 PetscInt count,start; 1091 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1092 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1093 PetscMemType rootmtype = link->rootmtype; 1094 PetscSFPackOpt opt = NULL; 1095 1096 PetscFunctionBegin; 1097 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1098 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1099 if (UnpackAndOp) { 1100 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1101 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1102 } else { 1103 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1104 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1105 } 1106 } 1107 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1108 PetscFunctionReturn(0); 1109 } 1110 1111 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1112 { 1113 PetscErrorCode ierr; 1114 const PetscInt *leafindices = NULL; 1115 PetscInt count,start; 1116 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1117 PetscMemType leafmtype = 
link->leafmtype; 1118 PetscSFPackOpt opt = NULL; 1119 1120 PetscFunctionBegin; 1121 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1122 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1123 if (UnpackAndOp) { 1124 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1125 ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1126 } else { 1127 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1128 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1129 } 1130 } 1131 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1132 PetscFunctionReturn(0); 1133 } 1134 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1135 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1136 { 1137 PetscErrorCode ierr; 1138 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1139 1140 PetscFunctionBegin; 1141 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1142 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);} 1143 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1144 if (scope == PETSCSF_REMOTE) { 1145 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1146 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1147 } 1148 PetscFunctionReturn(0); 1149 } 1150 1151 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1152 
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1153 { 1154 PetscErrorCode ierr; 1155 1156 PetscFunctionBegin; 1157 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1158 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);} 1159 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1160 if (scope == PETSCSF_REMOTE) { 1161 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1162 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1163 } 1164 PetscFunctionReturn(0); 1165 } 1166 1167 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1168 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1169 { 1170 PetscErrorCode ierr; 1171 const PetscInt *rootindices = NULL; 1172 PetscInt count,start; 1173 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1174 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1175 PetscMemType rootmtype = link->rootmtype; 1176 PetscSFPackOpt opt = NULL; 1177 1178 PetscFunctionBegin; 1179 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1180 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1181 /* Do FetchAndOp on rootdata with rootbuf */ 1182 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 1183 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1184 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1185 } 1186 ierr = 
PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1187 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1192 { 1193 PetscErrorCode ierr; 1194 const PetscInt *rootindices = NULL,*leafindices = NULL; 1195 PetscInt count,rootstart,leafstart; 1196 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1197 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1198 PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1199 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1200 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1201 char *srcbuf = NULL,*dstbuf = NULL; 1202 PetscBool dstdups; 1203 1204 PetscFunctionBegin; 1205 if (!buflen) PetscFunctionReturn(0); 1206 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1207 if (direction == PETSCSF_ROOT2LEAF) { 1208 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1209 srcmtype = rootmtype; 1210 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1211 dstmtype = leafmtype; 1212 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1213 } else { 1214 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1215 srcmtype = leafmtype; 1216 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1217 dstmtype = rootmtype; 1218 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1219 } 1220 ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 1221 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1222 if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 1223 if 
(direction == PETSCSF_ROOT2LEAF) { 1224 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1225 } else { 1226 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1227 } 1228 } else { 1229 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1230 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1231 ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1232 if (ScatterAndOp) { 1233 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1234 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1235 if (direction == PETSCSF_ROOT2LEAF) { 1236 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1237 } else { 1238 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1239 } 1240 } else { 1241 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1242 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1243 if (direction == PETSCSF_ROOT2LEAF) { 1244 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1245 } else { 1246 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1247 } 1248 } 1249 } 1250 PetscFunctionReturn(0); 1251 } 1252 1253 /* Fetch rootdata to leafdata and leafupdate locally */ 1254 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF 
sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1255 { 1256 PetscErrorCode ierr; 1257 const PetscInt *rootindices = NULL,*leafindices = NULL; 1258 PetscInt count,rootstart,leafstart; 1259 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1260 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1261 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1262 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1263 1264 PetscFunctionBegin; 1265 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1266 if (rootmtype != leafmtype) { 1267 /* The local communication has to go through pack and unpack */ 1268 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1269 } else { 1270 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1271 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1272 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1273 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1274 } 1275 PetscFunctionReturn(0); 1276 } 1277 1278 /* 1279 Create per-rank pack/unpack optimizations based on indice patterns 1280 1281 Input Parameters: 1282 + n - Number of destination ranks 1283 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1284 - idx - [*] Array storing indices 1285 1286 Output Parameters: 1287 + opt - Pack optimizations. NULL if no optimizations. 
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* Allocate the opt struct and one backing array holding all per-rank fields.
     Layout of opt->array (length 7n+2): [n | offset(n+1) | start(n) | dx(n) | dy(n) | dz(n) | X(n) | Y(n)].
     Storing n in array[0] lets the whole plan be shipped to a device as one contiguous buffer. */
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    /* NOTE(review): if a rank had m == 0 this reads the next rank's first index (or one past the
       array for the last rank); presumably callers never pass zero-length ranks -- TODO confirm */
    p++;

    /* Search in X dimension: count how many indices are consecutive starting from 'start' */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* X is the stride (in the unpacked array) between consecutive rows of the box.
       When dydz <= 1 there is no second row, so idx[p] may be past this rank's indices
       and must not be dereferenced; the ternary guards that. */
    X = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* Clean break at a row boundary: the Y extent is this dy; try Z next */
        }
      }
    }

Z_dimension:
    /* On arrival here dy is either the value at which the row pattern broke, or dydz if the
       Y loop ran to completion (the for-increment leaves dy == dydz). */
    dz = m/(dx*dy);
    /* Y is the number of rows per Z-plane (derived from the stride jump); only dereference
       idx[p] when there really is a second plane (dz > 1). */
    Y = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    /* Record the dx*dy*dz box with strides X (row) and X*Y (plane) for this rank */
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Prefix-sum the per-rank box volumes so offset[r] gives each rank's position in the packed buffer */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

/* Free a pack-optimization plan created by PetscSFCreatePackOpt; the backing array may live
   in host or device memory (mtype), hence PetscSFFree for the array but plain PetscFree for
   the host-resident struct. Safe to call when *out is NULL; resets *out to NULL. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

/* Analyze the SF's leaf (sf->rmine) and root (bas->irootloc) index lists and fill in the
   pack-related fields: buffer lengths, start indices, contiguity flags, per-rank pack
   optimization plans, and (with device support) duplicate-index flags. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* Leaf buffer sizes: entries for distinguished (on-node) ranks come first in rmine/roffset */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? (same analysis as above, on bas->irootloc) */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (PetscDefined(HAVE_DEVICE)) {
    /* A multi-SF is skipped here -- presumably its index structure makes the dup check unnecessary or invalid; TODO confirm */
    PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
    if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);}
    if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);}
    if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);}
    if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
  }
#endif
  PetscFunctionReturn(0);
}

/* Release the pack-optimization plans (host and, if built with device support, device copies)
   for both the LOCAL and REMOTE sides of the SF. Counterpart of PetscSFSetUpPackFields. */
PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}