1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 /* 8 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 9 * therefore we pack data types manually. This file defines packing routines for the standard data types. 10 */ 11 12 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 13 14 /* Operations working like s += t */ 15 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 17 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 18 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 19 /* Ref MPI MAXLOC */ 20 #define OP_XLOC(op,s,t) \ 21 do { \ 22 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 23 else if (!((s).u op (t).u)) s = t; \ 24 } while (0) 25 26 /* DEF_PackFunc - macro defining a Pack routine 27 28 Arguments of the macro: 29 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 30 .BS Block size for vectorization. It is a factor of bsz. 31 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 32 33 Arguments of the Pack routine: 34 +count Number of indices in idx[]. 35 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 36 .opt Per-pack optimization plan. NULL means no such plan. 37 .idx Indices of entries to packed. 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 40 -packed Address of the packed data. 41 */ 42 #define DEF_PackFunc(Type,BS,EQ) \ 43 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 44 { \ 45 PetscErrorCode ierr; \ 46 const Type *u = (const Type*)unpacked,*u2; \ 47 Type *p = (Type*)packed,*p2; \ 48 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 49 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 50 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 51 PetscFunctionBegin; \ 52 if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 53 else if (opt) { /* has optimizations available */ \ 54 p2 = p; \ 55 for (r=0; r<opt->n; r++) { \ 56 u2 = u + opt->start[r]*MBS; \ 57 X = opt->X[r]; \ 58 Y = opt->Y[r]; \ 59 for (k=0; k<opt->dz[r]; k++) \ 60 for (j=0; j<opt->dy[r]; j++) { \ 61 ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 62 p2 += opt->dx[r]*MBS; \ 63 } \ 64 } \ 65 } else { \ 66 for (i=0; i<count; i++) \ 67 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 68 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 69 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 70 } \ 71 PetscFunctionReturn(0); \ 72 } 73 74 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 75 and inserts into a sparse array. 76 77 Arguments: 78 .Type Type of the data 79 .BS Block size for vectorization 80 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 81 82 Notes: 83 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 84 */ 85 #define DEF_UnpackFunc(Type,BS,EQ) \ 86 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 87 { \ 88 PetscErrorCode ierr; \ 89 Type *u = (Type*)unpacked,*u2; \ 90 const Type *p = (const Type*)packed; \ 91 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 92 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 93 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 94 PetscFunctionBegin; \ 95 if (!idx) { \ 96 u += start*MBS; \ 97 if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 98 } else if (opt) { /* has optimizations available */ \ 99 for (r=0; r<opt->n; r++) { \ 100 u2 = u + opt->start[r]*MBS; \ 101 X = opt->X[r]; \ 102 Y = opt->Y[r]; \ 103 for (k=0; k<opt->dz[r]; k++) \ 104 for (j=0; j<opt->dy[r]; j++) { \ 105 ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 106 p += opt->dx[r]*MBS; \ 107 } \ 108 } \ 109 } else { \ 110 for (i=0; i<count; i++) \ 111 for (j=0; j<M; j++) \ 112 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 113 } \ 114 PetscFunctionReturn(0); \ 115 } 116 117 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 118 119 Arguments: 120 +Opname Name of the Op, such as Add, Mult, LAND, etc. 121 .Type Type of the data 122 .BS Block size for vectorization 123 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 124 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 125 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 126 */ 127 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 128 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 129 { \ 130 Type *u = (Type*)unpacked,*u2; \ 131 const Type *p = (const Type*)packed; \ 132 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 133 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 134 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 135 PetscFunctionBegin; \ 136 if (!idx) { \ 137 u += start*MBS; \ 138 for (i=0; i<count; i++) \ 139 for (j=0; j<M; j++) \ 140 for (k=0; k<BS; k++) \ 141 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 142 } else if (opt) { /* idx[] has patterns */ \ 143 for (r=0; r<opt->n; r++) { \ 144 u2 = u + opt->start[r]*MBS; \ 145 X = opt->X[r]; \ 146 Y = opt->Y[r]; \ 147 for (k=0; k<opt->dz[r]; k++) \ 148 for (j=0; j<opt->dy[r]; j++) { \ 149 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 150 p += opt->dx[r]*MBS; \ 151 } \ 152 } \ 153 } else { \ 154 for (i=0; i<count; i++) \ 155 for (j=0; j<M; j++) \ 156 for (k=0; k<BS; k++) \ 157 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 158 } \ 159 PetscFunctionReturn(0); \ 160 } 161 162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 163 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 164 { \ 165 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 166 PetscInt i,j,k,r,l,bs=link->bs; \ 167 const PetscInt M = (EQ) ? 1 : bs/BS; \ 168 const PetscInt MBS = M*BS; \ 169 PetscFunctionBegin; \ 170 for (i=0; i<count; i++) { \ 171 r = (!idx ? start+i : idx[i])*MBS; \ 172 l = i*MBS; \ 173 for (j=0; j<M; j++) \ 174 for (k=0; k<BS; k++) { \ 175 tmp = u[r+j*BS+k]; \ 176 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 177 p[l+j*BS+k] = tmp; \ 178 } \ 179 } \ 180 PetscFunctionReturn(0); \ 181 } 182 183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 184 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 185 { \ 186 PetscErrorCode ierr; \ 187 const Type *u = (const Type*)src; \ 188 Type *v = (Type*)dst; \ 189 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 190 const PetscInt M = (EQ) ? 1 : bs/BS; \ 191 const PetscInt MBS = M*BS; \ 192 PetscFunctionBegin; \ 193 if (!srcIdx) { /* src is contiguous */ \ 194 u += srcStart*MBS; \ 195 ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 196 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 197 u += srcOpt->start[0]*MBS; \ 198 v += dstStart*MBS; \ 199 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 200 for (k=0; k<srcOpt->dz[0]; k++) \ 201 for (j=0; j<srcOpt->dy[0]; j++) { \ 202 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 203 v += srcOpt->dx[0]*MBS; \ 204 } \ 205 } else { /* all other cases */ \ 206 for (i=0; i<count; i++) { \ 207 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 208 t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 209 for (j=0; j<M; j++) \ 210 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 211 } \ 212 } \ 213 PetscFunctionReturn(0); \ 214 } 215 216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 217 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 218 { \ 219 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 220 const Type *ldata = (const Type*)leafdata; \ 221 PetscInt i,j,k,r,l,bs = link->bs; \ 222 const PetscInt M = (EQ) ? 1 : bs/BS; \ 223 const PetscInt MBS = M*BS; \ 224 PetscFunctionBegin; \ 225 for (i=0; i<count; i++) { \ 226 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 227 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 228 for (j=0; j<M; j++) \ 229 for (k=0; k<BS; k++) { \ 230 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 231 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 232 } \ 233 } \ 234 PetscFunctionReturn(0); \ 235 } 236 237 /* Pack, Unpack/Fetch ops */ 238 #define DEF_Pack(Type,BS,EQ) \ 239 DEF_PackFunc(Type,BS,EQ) \ 240 DEF_UnpackFunc(Type,BS,EQ) \ 241 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 242 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 243 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 244 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 245 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 246 } 247 248 /* Add, Mult ops */ 249 #define DEF_Add(Type,BS,EQ) \ 250 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 251 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 252 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 253 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 254 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 255 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 256 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 257 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 258 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 259 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 260 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 261 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 262 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 263 } 264 265 /* Max, Min ops */ 266 #define DEF_Cmp(Type,BS,EQ) \ 267 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 268 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 269 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 270 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 271 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 272 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 273 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 274 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 275 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 276 } 277 278 /* Logical ops. 279 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 280 the compilation warning "empty macro arguments are undefined in ISO C90" 281 */ 282 #define DEF_Log(Type,BS,EQ) \ 283 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 284 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 285 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 286 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 287 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 288 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 289 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 290 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 291 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 292 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 293 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 294 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 295 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 296 } 297 298 /* Bitwise ops */ 299 #define DEF_Bit(Type,BS,EQ) \ 300 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 301 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 302 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 303 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 304 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 305 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 306 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 307 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 308 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 309 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 310 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 311 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 312 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 313 } 314 315 /* Maxloc, Minloc ops */ 316 #define DEF_Xloc(Type,BS,EQ) \ 317 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 318 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 319 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 320 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 321 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 322 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 323 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 324 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 325 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 326 } 327 328 #define DEF_IntegerType(Type,BS,EQ) \ 329 DEF_Pack(Type,BS,EQ) \ 330 DEF_Add(Type,BS,EQ) \ 331 DEF_Cmp(Type,BS,EQ) \ 332 DEF_Log(Type,BS,EQ) \ 333 DEF_Bit(Type,BS,EQ) \ 334 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 335 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 336 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 337 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 338 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 339 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 340 } 341 342 #define DEF_RealType(Type,BS,EQ) \ 343 DEF_Pack(Type,BS,EQ) \ 344 DEF_Add(Type,BS,EQ) \ 345 DEF_Cmp(Type,BS,EQ) \ 346 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 347 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 348 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 349 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 350 } 351 352 #if defined(PETSC_HAVE_COMPLEX) 353 #define DEF_ComplexType(Type,BS,EQ) \ 354 DEF_Pack(Type,BS,EQ) \ 355 DEF_Add(Type,BS,EQ) \ 356 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 357 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 358 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 359 } 360 #endif 361 362 #define DEF_DumbType(Type,BS,EQ) \ 363 DEF_Pack(Type,BS,EQ) \ 364 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 365 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 366 } 367 368 /* Maxloc, Minloc */ 369 #define DEF_PairType(Type,BS,EQ) \ 370 DEF_Pack(Type,BS,EQ) \ 371 DEF_Xloc(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 375 } 376 377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 385 386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 387 DEF_IntegerType(int,1,1) 388 DEF_IntegerType(int,2,1) 389 DEF_IntegerType(int,4,1) 390 DEF_IntegerType(int,8,1) 391 DEF_IntegerType(int,1,0) 392 DEF_IntegerType(int,2,0) 393 DEF_IntegerType(int,4,0) 394 DEF_IntegerType(int,8,0) 395 #endif 396 397 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 398 typedef signed char SignedChar; 399 DEF_IntegerType(SignedChar,1,1) 400 DEF_IntegerType(SignedChar,2,1) 401 DEF_IntegerType(SignedChar,4,1) 402 DEF_IntegerType(SignedChar,8,1) 403 DEF_IntegerType(SignedChar,1,0) 404 DEF_IntegerType(SignedChar,2,0) 405 DEF_IntegerType(SignedChar,4,0) 406 DEF_IntegerType(SignedChar,8,0) 407 408 typedef unsigned char UnsignedChar; 409 DEF_IntegerType(UnsignedChar,1,1) 410 DEF_IntegerType(UnsignedChar,2,1) 411 DEF_IntegerType(UnsignedChar,4,1) 412 DEF_IntegerType(UnsignedChar,8,1) 413 DEF_IntegerType(UnsignedChar,1,0) 414 DEF_IntegerType(UnsignedChar,2,0) 415 DEF_IntegerType(UnsignedChar,4,0) 416 DEF_IntegerType(UnsignedChar,8,0) 417 418 DEF_RealType(PetscReal,1,1) 419 DEF_RealType(PetscReal,2,1) 420 DEF_RealType(PetscReal,4,1) 421 DEF_RealType(PetscReal,8,1) 422 DEF_RealType(PetscReal,1,0) 423 DEF_RealType(PetscReal,2,0) 424 DEF_RealType(PetscReal,4,0) 425 DEF_RealType(PetscReal,8,0) 426 427 #if defined(PETSC_HAVE_COMPLEX) 428 DEF_ComplexType(PetscComplex,1,1) 429 DEF_ComplexType(PetscComplex,2,1) 430 DEF_ComplexType(PetscComplex,4,1) 431 DEF_ComplexType(PetscComplex,8,1) 432 DEF_ComplexType(PetscComplex,1,0) 433 DEF_ComplexType(PetscComplex,2,0) 434 DEF_ComplexType(PetscComplex,4,0) 435 DEF_ComplexType(PetscComplex,8,0) 436 #endif 437 438 #define PairType(Type1,Type2) Type1##_##Type2 439 typedef struct {int u; int i;} PairType(int,int); 440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 441 DEF_PairType(PairType(int,int),1,1) 442 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 443 444 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 445 DEF_DumbType(char,1,1) 446 DEF_DumbType(char,2,1) 447 DEF_DumbType(char,4,1) 448 DEF_DumbType(char,1,0) 449 DEF_DumbType(char,2,0) 450 DEF_DumbType(char,4,0) 451 452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 453 DEF_DumbType(DumbInt,1,1) 454 DEF_DumbType(DumbInt,2,1) 455 DEF_DumbType(DumbInt,4,1) 456 DEF_DumbType(DumbInt,8,1) 457 DEF_DumbType(DumbInt,1,0) 458 DEF_DumbType(DumbInt,2,0) 459 DEF_DumbType(DumbInt,4,0) 460 DEF_DumbType(DumbInt,8,0) 461 462 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 463 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 464 { 465 int ierr; 466 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 467 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 468 return MPI_SUCCESS; 469 } 470 #endif 471 472 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 473 { 474 PetscErrorCode ierr; 475 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 476 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 477 478 PetscFunctionBegin; 479 /* Destroy device-specific fields */ 480 if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);} 481 482 /* Destroy host related fields */ 483 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);} 484 if (!link->use_nvshmem) { 485 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 486 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);} 487 } 488 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 489 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 490 ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 491 ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 492 } 493 } 494 ierr = PetscFree(link);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 499 { 500 PetscErrorCode ierr; 501 502 PetscFunctionBegin; 503 ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 504 #if defined(PETSC_HAVE_NVSHMEM) 505 { 506 PetscBool use_nvshmem; 507 ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr); 508 if (use_nvshmem) { 509 ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 510 PetscFunctionReturn(0); 511 } 512 } 513 #endif 514 ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 515 PetscFunctionReturn(0); 516 } 517 518 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 519 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 520 */ 521 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 522 { 523 PetscErrorCode ierr; 524 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 525 PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 526 const PetscInt *rootoffset,*leafoffset; 527 PetscMPIInt n; 528 MPI_Aint disp; 529 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 530 MPI_Datatype unit = link->unit; 531 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 532 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 533 534 PetscFunctionBegin; 535 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 536 if (sf->persistent) { 537 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 538 ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 539 if (direction == PETSCSF_LEAF2ROOT) { 540 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 541 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 542 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 543 ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 544 } 545 } else { /* PETSCSF_ROOT2LEAF */ 546 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 547 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 548 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 549 ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 550 } 551 } 552 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 553 } 554 555 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 556 ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 557 if (direction == PETSCSF_LEAF2ROOT) { 558 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 559 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 560 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 561 ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 562 } 563 } else { /* PETSCSF_ROOT2LEAF */ 564 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 565 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 566 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 567 ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 568 } 569 } 570 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 571 } 572 } 573 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 574 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 575 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 576 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 577 PetscFunctionReturn(0); 578 } 579 580 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 581 { 582 PetscErrorCode ierr; 583 PetscSFLink link,*p; 584 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 585 586 PetscFunctionBegin; 587 /* Look for types in cache */ 588 for (p=&bas->inuse; (link=*p); p=&link->next) { 589 PetscBool match; 590 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 591 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 592 switch (cmode) { 593 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 594 case PETSC_USE_POINTER: break; 595 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 596 } 597 *mylink = link; 598 PetscFunctionReturn(0); 599 } 600 } 601 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 602 } 603 604 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 605 { 606 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 607 PetscSFLink link = *mylink; 608 609 PetscFunctionBegin; 610 link->rootdata = NULL; 611 link->leafdata = NULL; 612 link->next = bas->avail; 613 bas->avail = link; 614 *mylink = NULL; 615 PetscFunctionReturn(0); 616 } 617 618 /* Error out on unsupported overlapped communications */ 619 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 620 { 621 PetscErrorCode ierr; 622 PetscSFLink link,*p; 623 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 624 PetscBool match; 625 626 PetscFunctionBegin; 627 if (PetscDefined(USE_DEBUG)) { 628 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 629 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 630 */ 631 if (rootdata || leafdata) { 632 for (p=&bas->inuse; (link=*p); p=&link->next) { 633 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 634 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 635 } 636 } 637 } 638 PetscFunctionReturn(0); 639 } 640 641 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 642 { 643 PetscFunctionBegin; 644 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 645 PetscFunctionReturn(0); 646 } 647 648 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 649 { 650 PetscErrorCode ierr; 651 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 652 PetscBool is2Int,is2PetscInt; 653 PetscMPIInt ni,na,nd,combiner; 654 #if defined(PETSC_HAVE_COMPLEX) 655 PetscInt nPetscComplex=0; 656 #endif 657 658 PetscFunctionBegin; 659 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 660 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 661 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 662 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 663 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 664 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 665 #if defined(PETSC_HAVE_COMPLEX) 666 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 667 #endif 668 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 669 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 670 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 671 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 672 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 673 link->bs = 1; /* default */ 674 675 if (is2Int) { 676 PackInit_PairType_int_int_1_1(link); 677 link->bs = 1; 678 link->unitbytes = 2*sizeof(int); 679 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 680 link->basicunit = MPI_2INT; 681 link->unit = MPI_2INT; 682 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 683 PackInit_PairType_PetscInt_PetscInt_1_1(link); 684 link->bs = 1; 685 link->unitbytes = 2*sizeof(PetscInt); 686 link->basicunit = MPIU_2INT; 687 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 688 link->unit = MPIU_2INT; 689 } else if (nPetscReal) { 690 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 691 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 692 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 693 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 694 link->bs = nPetscReal; 695 link->unitbytes = nPetscReal*sizeof(PetscReal); 696 link->basicunit = MPIU_REAL; 697 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 698 } else if (nPetscInt) { 699 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 700 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 701 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 702 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 703 link->bs = nPetscInt; 704 link->unitbytes = nPetscInt*sizeof(PetscInt); 705 link->basicunit = MPIU_INT; 706 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 707 #if defined(PETSC_USE_64BIT_INDICES) 708 } else if (nInt) { 709 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 710 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 711 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 712 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 713 link->bs = nInt; 714 link->unitbytes = nInt*sizeof(int); 715 link->basicunit = MPI_INT; 716 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 717 #endif 718 } else if (nSignedChar) { 719 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 720 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 721 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 722 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 723 link->bs = nSignedChar; 724 link->unitbytes = nSignedChar*sizeof(SignedChar); 725 link->basicunit = MPI_SIGNED_CHAR; 726 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 727 } else if (nUnsignedChar) { 728 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 729 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 730 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 731 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 732 link->bs = nUnsignedChar; 733 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 734 link->basicunit = MPI_UNSIGNED_CHAR; 735 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 736 #if defined(PETSC_HAVE_COMPLEX) 737 } else if (nPetscComplex) { 738 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 739 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 740 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 741 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 742 link->bs = nPetscComplex; 743 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 744 link->basicunit = MPIU_COMPLEX; 745 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 746 #endif 747 } else { 748 MPI_Aint lb,nbyte; 749 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 750 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 751 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 752 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 753 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 754 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 755 link->bs = nbyte; 756 link->unitbytes = nbyte; 757 link->basicunit = MPI_BYTE; 758 } else { 759 nInt = nbyte / sizeof(int); 760 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 761 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 762 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 763 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 764 link->bs = nInt; 765 link->unitbytes = nbyte; 766 link->basicunit = MPI_INT; 767 } 768 if (link->isbuiltin) link->unit = unit; 769 } 770 771 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 772 773 link->Memcpy = PetscSFLinkMemcpy_Host; 774 PetscFunctionReturn(0); 775 } 776 777 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 778 { 779 PetscFunctionBegin; 780 *UnpackAndOp = NULL; 781 if (PetscMemTypeHost(mtype)) { 782 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 783 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 784 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 785 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 786 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 787 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 788 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 789 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 790 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 791 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 792 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 793 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 794 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 795 } 796 #if defined(PETSC_HAVE_DEVICE) 797 else if (PetscMemTypeDevice(mtype) && !atomic) { 798 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 799 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 800 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 801 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 802 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 803 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 804 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 805 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 806 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 807 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 808 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 809 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 810 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 811 } else if (PetscMemTypeDevice(mtype) && atomic) { 812 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 813 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 814 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 815 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 816 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 817 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 818 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 819 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 820 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 821 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 822 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 823 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 824 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 825 } 826 #endif 827 PetscFunctionReturn(0); 828 } 829 830 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 831 { 832 PetscFunctionBegin; 833 *ScatterAndOp = NULL; 834 if (PetscMemTypeHost(mtype)) { 835 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 836 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 837 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 838 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 839 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 840 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 841 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 842 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 843 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 844 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 845 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 846 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 847 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 848 } 849 #if defined(PETSC_HAVE_DEVICE) 850 else if (PetscMemTypeDevice(mtype) && !atomic) { 851 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 852 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 853 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 854 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 855 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 856 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 857 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 858 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 859 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 860 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 861 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 862 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 863 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 864 } else if (PetscMemTypeDevice(mtype) && atomic) { 865 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 866 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 867 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 868 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 869 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 870 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 871 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 872 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 873 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 874 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 875 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 876 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 877 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 878 } 879 #endif 880 PetscFunctionReturn(0); 881 } 882 883 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 884 { 885 PetscFunctionBegin; 886 *FetchAndOp = NULL; 887 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 888 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 889 #if defined(PETSC_HAVE_DEVICE) 890 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 891 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 892 #endif 893 PetscFunctionReturn(0); 894 } 895 896 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 897 { 898 PetscFunctionBegin; 899 *FetchAndOpLocal = NULL; 900 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 901 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 902 #if defined(PETSC_HAVE_DEVICE) 903 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 904 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 905 #endif 906 PetscFunctionReturn(0); 907 } 908 909 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 910 { 911 PetscErrorCode ierr; 912 PetscLogDouble flops; 913 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 914 915 PetscFunctionBegin; 916 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 917 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 918 #if defined(PETSC_HAVE_DEVICE) 919 if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 920 #endif 921 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 922 } 923 PetscFunctionReturn(0); 924 } 925 926 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 927 { 928 PetscLogDouble flops; 929 PetscErrorCode ierr; 930 931 PetscFunctionBegin; 932 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 933 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 934 #if defined(PETSC_HAVE_DEVICE) 935 if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 936 #endif 937 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 938 } 939 PetscFunctionReturn(0); 940 } 941 942 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 943 Input Arguments: 944 +sf - The StarForest 945 .link - The link 946 .count - Number of entries to unpack 947 .start - The first index, significent when indices=NULL 948 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 949 .buf - A contiguous buffer to unpack from 950 -op - Operation after unpack 951 952 Output Arguments: 953 .data - The data to unpack to 954 */ 955 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 956 { 957 PetscFunctionBegin; 958 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 959 { 960 PetscErrorCode ierr; 961 PetscInt i; 962 PetscMPIInt n; 963 if (indices) { 964 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 965 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 966 */ 967 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 968 } else { 969 ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 970 ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr); 971 } 972 } 973 PetscFunctionReturn(0); 974 #else 975 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 976 #endif 977 } 978 979 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 980 { 981 PetscFunctionBegin; 982 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 983 { 984 PetscErrorCode ierr; 985 PetscInt i,disp; 986 if (!srcIdx) { 987 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 988 } else { 989 for (i=0; i<count; i++) { 990 disp = dstIdx? dstIdx[i] : dstStart + i; 991 ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 992 } 993 } 994 } 995 PetscFunctionReturn(0); 996 #else 997 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 998 #endif 999 } 1000 1001 /*============================================================================= 1002 Pack/Unpack/Fetch/Scatter routines 1003 ============================================================================*/ 1004 1005 /* Pack rootdata to rootbuf 1006 Input Arguments: 1007 + sf - The SF this packing works on. 1008 . link - It gives the memtype of the roots and also provides root buffer. 1009 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1010 - rootdata - Where to read the roots. 1011 1012 Notes: 1013 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1014 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 1015 */ 1016 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1017 { 1018 PetscErrorCode ierr; 1019 const PetscInt *rootindices = NULL; 1020 PetscInt count,start; 1021 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1022 PetscMemType rootmtype = link->rootmtype; 1023 PetscSFPackOpt opt = NULL; 1024 1025 PetscFunctionBegin; 1026 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1027 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1028 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1029 ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1030 ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1031 } 1032 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 1036 /* Pack leafdata to leafbuf */ 1037 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1038 { 1039 PetscErrorCode ierr; 1040 const PetscInt *leafindices = NULL; 1041 PetscInt count,start; 1042 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1043 PetscMemType leafmtype = link->leafmtype; 1044 PetscSFPackOpt opt = NULL; 1045 1046 PetscFunctionBegin; 1047 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1048 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1049 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1050 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1051 ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1052 } 1053 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1054 PetscFunctionReturn(0); 1055 } 1056 1057 /* Pack rootdata to rootbuf, which are in the same memory space */ 1058 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1059 { 1060 PetscErrorCode ierr; 1061 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1062 1063 PetscFunctionBegin; 1064 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1065 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1066 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1067 } 1068 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1069 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);} 1070 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1071 PetscFunctionReturn(0); 1072 } 1073 /* Pack leafdata to leafbuf, which are in the same memory space */ 1074 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1075 { 1076 PetscErrorCode ierr; 1077 1078 PetscFunctionBegin; 1079 if (scope == PETSCSF_REMOTE) { 1080 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1081 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1082 } 1083 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1084 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);} 1085 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1086 PetscFunctionReturn(0); 1087 } 1088 1089 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1090 { 1091 PetscErrorCode ierr; 1092 const PetscInt *rootindices = NULL; 1093 PetscInt count,start; 1094 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1095 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1096 PetscMemType rootmtype = link->rootmtype; 1097 PetscSFPackOpt opt = NULL; 1098 1099 PetscFunctionBegin; 1100 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1101 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1102 if (UnpackAndOp) { 1103 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1104 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1105 } else { 1106 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1107 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1108 } 1109 } 1110 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1111 PetscFunctionReturn(0); 1112 } 1113 1114 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1115 { 1116 PetscErrorCode ierr; 1117 const PetscInt *leafindices = NULL; 1118 PetscInt count,start; 1119 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1120 PetscMemType leafmtype = link->leafmtype; 1121 PetscSFPackOpt opt = NULL; 1122 1123 PetscFunctionBegin; 1124 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1125 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1126 if (UnpackAndOp) { 1127 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1128 ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1129 } else { 1130 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1131 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1132 } 1133 } 1134 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1135 PetscFunctionReturn(0); 1136 } 1137 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1138 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1139 { 1140 PetscErrorCode ierr; 1141 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1142 1143 PetscFunctionBegin; 1144 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1145 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);} 1146 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1147 if (scope == PETSCSF_REMOTE) { 1148 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1149 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1150 } 1151 PetscFunctionReturn(0); 1152 } 1153 1154 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1155 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1156 { 1157 PetscErrorCode ierr; 1158 1159 PetscFunctionBegin; 1160 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1161 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);} 1162 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1163 if (scope == PETSCSF_REMOTE) { 1164 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1165 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1166 } 1167 PetscFunctionReturn(0); 1168 } 1169 1170 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1171 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1172 { 1173 PetscErrorCode ierr; 1174 const PetscInt *rootindices = NULL; 1175 PetscInt count,start; 1176 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1177 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1178 PetscMemType rootmtype = link->rootmtype; 1179 PetscSFPackOpt opt = NULL; 1180 1181 PetscFunctionBegin; 1182 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1183 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1184 /* Do FetchAndOp on rootdata with rootbuf */ 1185 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 1186 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1187 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1188 } 1189 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1190 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1191 PetscFunctionReturn(0); 1192 } 1193 1194 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1195 { 1196 PetscErrorCode ierr; 1197 const PetscInt *rootindices = NULL,*leafindices = NULL; 1198 PetscInt count,rootstart,leafstart; 1199 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1200 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1201 PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1202 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1203 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1204 char *srcbuf = NULL,*dstbuf = NULL; 1205 PetscBool dstdups; 1206 1207 PetscFunctionBegin; 1208 if (!buflen) PetscFunctionReturn(0); 1209 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1210 if (direction == PETSCSF_ROOT2LEAF) { 1211 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1212 srcmtype = rootmtype; 1213 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1214 dstmtype = leafmtype; 1215 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1216 } else { 1217 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1218 srcmtype = leafmtype; 1219 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1220 dstmtype = rootmtype; 1221 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1222 } 1223 ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 1224 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1225 if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 1226 if (direction == PETSCSF_ROOT2LEAF) { 1227 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1228 } else { 1229 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1230 } 1231 } else { 1232 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1233 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1234 ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1235 if (ScatterAndOp) { 1236 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1237 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1238 if (direction == PETSCSF_ROOT2LEAF) { 1239 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1240 } else { 1241 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1242 } 1243 } else { 1244 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1245 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1246 if (direction == PETSCSF_ROOT2LEAF) { 1247 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1248 } else { 1249 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1250 } 1251 } 1252 } 1253 PetscFunctionReturn(0); 1254 } 1255 1256 /* Fetch rootdata to leafdata and leafupdate locally */ 1257 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1258 { 1259 PetscErrorCode ierr; 1260 const PetscInt *rootindices = NULL,*leafindices = NULL; 1261 PetscInt count,rootstart,leafstart; 1262 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1263 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1264 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1265 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1266 1267 PetscFunctionBegin; 1268 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1269 if (rootmtype != leafmtype) { 1270 /* The local communication has to go through pack and unpack */ 1271 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1272 } else { 1273 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1274 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1275 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1276 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1277 } 1278 PetscFunctionReturn(0); 1279 } 1280 1281 /* 1282 Create per-rank pack/unpack optimizations based on indice patterns 1283 1284 Input Parameters: 1285 + n - Number of destination ranks 1286 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1287 - idx - [*] Array storing indices 1288 1289 Output Parameters: 1290 + opt - Pack optimizations. NULL if no optimizations. 1291 */ 1292 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1293 { 1294 PetscErrorCode ierr; 1295 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1296 PetscBool optimizable = PETSC_TRUE; 1297 PetscSFPackOpt opt; 1298 1299 PetscFunctionBegin; 1300 ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1301 ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1302 opt->n = opt->array[0] = n; 1303 opt->offset = opt->array + 1; 1304 opt->start = opt->array + n + 2; 1305 opt->dx = opt->array + 2*n + 2; 1306 opt->dy = opt->array + 3*n + 2; 1307 opt->dz = opt->array + 4*n + 2; 1308 opt->X = opt->array + 5*n + 2; 1309 opt->Y = opt->array + 6*n + 2; 1310 1311 for (r=0; r<n; r++) { /* For each destination rank */ 1312 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1313 p = offset[r]; 1314 start = idx[p]; /* First index for this rank */ 1315 p++; 1316 1317 /* Search in X dimension */ 1318 for (dx=1; dx<m; dx++,p++) { 1319 if (start+dx != idx[p]) break; 1320 } 1321 1322 dydz = m/dx; 1323 X = dydz > 1 ? (idx[p]-start) : dx; 1324 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1325 if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1326 for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1327 for (i=0; i<dx; i++,p++) { 1328 if (start+X*dy+i != idx[p]) { 1329 if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1330 else goto Z_dimension; 1331 } 1332 } 1333 } 1334 1335 Z_dimension: 1336 dz = m/(dx*dy); 1337 Y = dz > 1 ? (idx[p]-start)/X : dy; 1338 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1339 if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1340 for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1341 for (j=0; j<dy; j++) { 1342 for (i=0; i<dx; i++,p++) { 1343 if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 1344 } 1345 } 1346 } 1347 opt->start[r] = start; 1348 opt->dx[r] = dx; 1349 opt->dy[r] = dy; 1350 opt->dz[r] = dz; 1351 opt->X[r] = X; 1352 opt->Y[r] = Y; 1353 } 1354 1355 finish: 1356 /* If not optimizable, free arrays to save memory */ 1357 if (!n || !optimizable) { 1358 ierr = PetscFree(opt->array);CHKERRQ(ierr); 1359 ierr = PetscFree(opt);CHKERRQ(ierr); 1360 *out = NULL; 1361 } else { 1362 opt->offset[0] = 0; 1363 for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1364 *out = opt; 1365 } 1366 PetscFunctionReturn(0); 1367 } 1368 1369 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 1370 { 1371 PetscErrorCode ierr; 1372 PetscSFPackOpt opt = *out; 1373 1374 PetscFunctionBegin; 1375 if (opt) { 1376 ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr); 1377 ierr = PetscFree(opt);CHKERRQ(ierr); 1378 *out = NULL; 1379 } 1380 PetscFunctionReturn(0); 1381 } 1382 1383 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1384 { 1385 PetscErrorCode ierr; 1386 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1387 PetscInt i,j; 1388 1389 PetscFunctionBegin; 1390 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1391 for (i=0; i<2; i++) { /* Set defaults */ 1392 sf->leafstart[i] = 0; 1393 sf->leafcontig[i] = PETSC_TRUE; 1394 sf->leafdups[i] = PETSC_FALSE; 1395 bas->rootstart[i] = 0; 1396 bas->rootcontig[i] = PETSC_TRUE; 1397 bas->rootdups[i] = PETSC_FALSE; 1398 } 1399 1400 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1401 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1402 1403 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1404 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1405 1406 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1407 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1408 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1409 } 1410 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1411 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1412 } 1413 1414 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1415 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1416 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1417 1418 /* Are root indices for self and remote contiguous? */ 1419 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1420 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1421 1422 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1423 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1424 1425 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1426 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1427 } 1428 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1429 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1430 } 1431 1432 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1433 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1434 1435 #if defined(PETSC_HAVE_DEVICE) 1436 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1437 if (PetscDefined(HAVE_DEVICE)) { 1438 PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1439 if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1440 if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1441 if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1442 if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1443 } 1444 #endif 1445 PetscFunctionReturn(0); 1446 } 1447 1448 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1449 { 1450 PetscErrorCode ierr; 1451 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1452 PetscInt i; 1453 1454 PetscFunctionBegin; 1455 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1456 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1457 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1458 #if defined(PETSC_HAVE_DEVICE) 1459 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1460 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1461 #endif 1462 } 1463 PetscFunctionReturn(0); 1464 } 1465