1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 /* 8 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 9 * therefore we pack data types manually. This file defines packing routines for the standard data types. 10 */ 11 12 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 13 14 /* Operations working like s += t */ 15 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 17 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 18 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 19 /* Ref MPI MAXLOC */ 20 #define OP_XLOC(op,s,t) \ 21 do { \ 22 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 23 else if (!((s).u op (t).u)) s = t; \ 24 } while (0) 25 26 /* DEF_PackFunc - macro defining a Pack routine 27 28 Arguments of the macro: 29 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 30 .BS Block size for vectorization. It is a factor of bsz. 31 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 32 33 Arguments of the Pack routine: 34 +count Number of indices in idx[]. 35 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 36 .opt Per-pack optimization plan. NULL means no such plan. 37 .idx Indices of entries to packed. 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 40 -packed Address of the packed data. 41 */ 42 #define DEF_PackFunc(Type,BS,EQ) \ 43 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 44 { \ 45 PetscErrorCode ierr; \ 46 const Type *u = (const Type*)unpacked,*u2; \ 47 Type *p = (Type*)packed,*p2; \ 48 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 49 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 50 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 51 PetscFunctionBegin; \ 52 if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 53 else if (opt) { /* has optimizations available */ \ 54 p2 = p; \ 55 for (r=0; r<opt->n; r++) { \ 56 u2 = u + opt->start[r]*MBS; \ 57 X = opt->X[r]; \ 58 Y = opt->Y[r]; \ 59 for (k=0; k<opt->dz[r]; k++) \ 60 for (j=0; j<opt->dy[r]; j++) { \ 61 ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 62 p2 += opt->dx[r]*MBS; \ 63 } \ 64 } \ 65 } else { \ 66 for (i=0; i<count; i++) \ 67 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 68 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 69 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 70 } \ 71 PetscFunctionReturn(0); \ 72 } 73 74 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 75 and inserts into a sparse array. 76 77 Arguments: 78 .Type Type of the data 79 .BS Block size for vectorization 80 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 81 82 Notes: 83 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 84 */ 85 #define DEF_UnpackFunc(Type,BS,EQ) \ 86 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 87 { \ 88 PetscErrorCode ierr; \ 89 Type *u = (Type*)unpacked,*u2; \ 90 const Type *p = (const Type*)packed; \ 91 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 92 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 93 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 94 PetscFunctionBegin; \ 95 if (!idx) { \ 96 u += start*MBS; \ 97 if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 98 } else if (opt) { /* has optimizations available */ \ 99 for (r=0; r<opt->n; r++) { \ 100 u2 = u + opt->start[r]*MBS; \ 101 X = opt->X[r]; \ 102 Y = opt->Y[r]; \ 103 for (k=0; k<opt->dz[r]; k++) \ 104 for (j=0; j<opt->dy[r]; j++) { \ 105 ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 106 p += opt->dx[r]*MBS; \ 107 } \ 108 } \ 109 } else { \ 110 for (i=0; i<count; i++) \ 111 for (j=0; j<M; j++) \ 112 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 113 } \ 114 PetscFunctionReturn(0); \ 115 } 116 117 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 118 119 Arguments: 120 +Opname Name of the Op, such as Add, Mult, LAND, etc. 121 .Type Type of the data 122 .BS Block size for vectorization 123 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 124 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 125 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 126 */ 127 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 128 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 129 { \ 130 Type *u = (Type*)unpacked,*u2; \ 131 const Type *p = (const Type*)packed; \ 132 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 133 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 134 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 135 PetscFunctionBegin; \ 136 if (!idx) { \ 137 u += start*MBS; \ 138 for (i=0; i<count; i++) \ 139 for (j=0; j<M; j++) \ 140 for (k=0; k<BS; k++) \ 141 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 142 } else if (opt) { /* idx[] has patterns */ \ 143 for (r=0; r<opt->n; r++) { \ 144 u2 = u + opt->start[r]*MBS; \ 145 X = opt->X[r]; \ 146 Y = opt->Y[r]; \ 147 for (k=0; k<opt->dz[r]; k++) \ 148 for (j=0; j<opt->dy[r]; j++) { \ 149 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 150 p += opt->dx[r]*MBS; \ 151 } \ 152 } \ 153 } else { \ 154 for (i=0; i<count; i++) \ 155 for (j=0; j<M; j++) \ 156 for (k=0; k<BS; k++) \ 157 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 158 } \ 159 PetscFunctionReturn(0); \ 160 } 161 162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 163 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 164 { \ 165 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 166 PetscInt i,j,k,r,l,bs=link->bs; \ 167 const PetscInt M = (EQ) ? 1 : bs/BS; \ 168 const PetscInt MBS = M*BS; \ 169 PetscFunctionBegin; \ 170 for (i=0; i<count; i++) { \ 171 r = (!idx ? start+i : idx[i])*MBS; \ 172 l = i*MBS; \ 173 for (j=0; j<M; j++) \ 174 for (k=0; k<BS; k++) { \ 175 tmp = u[r+j*BS+k]; \ 176 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 177 p[l+j*BS+k] = tmp; \ 178 } \ 179 } \ 180 PetscFunctionReturn(0); \ 181 } 182 183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 184 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 185 { \ 186 PetscErrorCode ierr; \ 187 const Type *u = (const Type*)src; \ 188 Type *v = (Type*)dst; \ 189 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 190 const PetscInt M = (EQ) ? 1 : bs/BS; \ 191 const PetscInt MBS = M*BS; \ 192 PetscFunctionBegin; \ 193 if (!srcIdx) { /* src is contiguous */ \ 194 u += srcStart*MBS; \ 195 ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 196 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 197 u += srcOpt->start[0]*MBS; \ 198 v += dstStart*MBS; \ 199 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 200 for (k=0; k<srcOpt->dz[0]; k++) \ 201 for (j=0; j<srcOpt->dy[0]; j++) { \ 202 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 203 v += srcOpt->dx[0]*MBS; \ 204 } \ 205 } else { /* all other cases */ \ 206 for (i=0; i<count; i++) { \ 207 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 208 t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 209 for (j=0; j<M; j++) \ 210 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 211 } \ 212 } \ 213 PetscFunctionReturn(0); \ 214 } 215 216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 217 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 218 { \ 219 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 220 const Type *ldata = (const Type*)leafdata; \ 221 PetscInt i,j,k,r,l,bs = link->bs; \ 222 const PetscInt M = (EQ) ? 1 : bs/BS; \ 223 const PetscInt MBS = M*BS; \ 224 PetscFunctionBegin; \ 225 for (i=0; i<count; i++) { \ 226 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 227 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 228 for (j=0; j<M; j++) \ 229 for (k=0; k<BS; k++) { \ 230 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 231 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 232 } \ 233 } \ 234 PetscFunctionReturn(0); \ 235 } 236 237 /* Pack, Unpack/Fetch ops */ 238 #define DEF_Pack(Type,BS,EQ) \ 239 DEF_PackFunc(Type,BS,EQ) \ 240 DEF_UnpackFunc(Type,BS,EQ) \ 241 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 242 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 243 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 244 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 245 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 246 } 247 248 /* Add, Mult ops */ 249 #define DEF_Add(Type,BS,EQ) \ 250 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 251 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 252 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 253 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 254 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 255 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 256 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 257 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 258 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 259 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 260 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 261 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 262 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 263 } 264 265 /* Max, Min ops */ 266 #define DEF_Cmp(Type,BS,EQ) \ 267 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 268 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 269 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 270 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 271 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 272 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 273 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 274 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 275 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 276 } 277 278 /* Logical ops. 279 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 280 the compilation warning "empty macro arguments are undefined in ISO C90" 281 */ 282 #define DEF_Log(Type,BS,EQ) \ 283 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 284 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 285 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 286 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 287 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 288 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 289 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 290 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 291 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 292 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 293 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 294 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 295 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 296 } 297 298 /* Bitwise ops */ 299 #define DEF_Bit(Type,BS,EQ) \ 300 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 301 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 302 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 303 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 304 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 305 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 306 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 307 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 308 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 309 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 310 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 311 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 312 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 313 } 314 315 /* Maxloc, Minloc ops */ 316 #define DEF_Xloc(Type,BS,EQ) \ 317 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 318 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 319 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 320 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 321 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 322 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 323 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 324 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 325 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 326 } 327 328 #define DEF_IntegerType(Type,BS,EQ) \ 329 DEF_Pack(Type,BS,EQ) \ 330 DEF_Add(Type,BS,EQ) \ 331 DEF_Cmp(Type,BS,EQ) \ 332 DEF_Log(Type,BS,EQ) \ 333 DEF_Bit(Type,BS,EQ) \ 334 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 335 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 336 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 337 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 338 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 339 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 340 } 341 342 #define DEF_RealType(Type,BS,EQ) \ 343 DEF_Pack(Type,BS,EQ) \ 344 DEF_Add(Type,BS,EQ) \ 345 DEF_Cmp(Type,BS,EQ) \ 346 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 347 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 348 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 349 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 350 } 351 352 #if defined(PETSC_HAVE_COMPLEX) 353 #define DEF_ComplexType(Type,BS,EQ) \ 354 DEF_Pack(Type,BS,EQ) \ 355 DEF_Add(Type,BS,EQ) \ 356 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 357 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 358 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 359 } 360 #endif 361 362 #define DEF_DumbType(Type,BS,EQ) \ 363 DEF_Pack(Type,BS,EQ) \ 364 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 365 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 366 } 367 368 /* Maxloc, Minloc */ 369 #define DEF_PairType(Type,BS,EQ) \ 370 DEF_Pack(Type,BS,EQ) \ 371 DEF_Xloc(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 375 } 376 377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 385 386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 387 DEF_IntegerType(int,1,1) 388 DEF_IntegerType(int,2,1) 389 DEF_IntegerType(int,4,1) 390 DEF_IntegerType(int,8,1) 391 DEF_IntegerType(int,1,0) 392 DEF_IntegerType(int,2,0) 393 DEF_IntegerType(int,4,0) 394 DEF_IntegerType(int,8,0) 395 #endif 396 397 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 398 typedef signed char SignedChar; 399 DEF_IntegerType(SignedChar,1,1) 400 DEF_IntegerType(SignedChar,2,1) 401 DEF_IntegerType(SignedChar,4,1) 402 DEF_IntegerType(SignedChar,8,1) 403 DEF_IntegerType(SignedChar,1,0) 404 DEF_IntegerType(SignedChar,2,0) 405 DEF_IntegerType(SignedChar,4,0) 406 DEF_IntegerType(SignedChar,8,0) 407 408 typedef unsigned char UnsignedChar; 409 DEF_IntegerType(UnsignedChar,1,1) 410 DEF_IntegerType(UnsignedChar,2,1) 411 DEF_IntegerType(UnsignedChar,4,1) 412 DEF_IntegerType(UnsignedChar,8,1) 413 DEF_IntegerType(UnsignedChar,1,0) 414 DEF_IntegerType(UnsignedChar,2,0) 415 DEF_IntegerType(UnsignedChar,4,0) 416 DEF_IntegerType(UnsignedChar,8,0) 417 418 DEF_RealType(PetscReal,1,1) 419 DEF_RealType(PetscReal,2,1) 420 DEF_RealType(PetscReal,4,1) 421 DEF_RealType(PetscReal,8,1) 422 DEF_RealType(PetscReal,1,0) 423 DEF_RealType(PetscReal,2,0) 424 DEF_RealType(PetscReal,4,0) 425 DEF_RealType(PetscReal,8,0) 426 427 #if defined(PETSC_HAVE_COMPLEX) 428 DEF_ComplexType(PetscComplex,1,1) 429 DEF_ComplexType(PetscComplex,2,1) 430 DEF_ComplexType(PetscComplex,4,1) 431 DEF_ComplexType(PetscComplex,8,1) 432 DEF_ComplexType(PetscComplex,1,0) 433 DEF_ComplexType(PetscComplex,2,0) 434 DEF_ComplexType(PetscComplex,4,0) 435 DEF_ComplexType(PetscComplex,8,0) 436 #endif 437 438 #define PairType(Type1,Type2) Type1##_##Type2 439 typedef struct {int u; int i;} PairType(int,int); 440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 441 DEF_PairType(PairType(int,int),1,1) 442 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 443 444 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 445 DEF_DumbType(char,1,1) 446 DEF_DumbType(char,2,1) 447 DEF_DumbType(char,4,1) 448 DEF_DumbType(char,1,0) 449 DEF_DumbType(char,2,0) 450 DEF_DumbType(char,4,0) 451 452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 453 DEF_DumbType(DumbInt,1,1) 454 DEF_DumbType(DumbInt,2,1) 455 DEF_DumbType(DumbInt,4,1) 456 DEF_DumbType(DumbInt,8,1) 457 DEF_DumbType(DumbInt,1,0) 458 DEF_DumbType(DumbInt,2,0) 459 DEF_DumbType(DumbInt,4,0) 460 DEF_DumbType(DumbInt,8,0) 461 462 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 463 { 464 PetscErrorCode ierr; 465 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 466 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 467 468 PetscFunctionBegin; 469 /* Destroy device-specific fields */ 470 if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);} 471 472 /* Destroy host related fields */ 473 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);} 474 if (!link->use_nvshmem) { 475 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 476 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);} 477 } 478 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 479 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 480 ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 481 ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 482 } 483 } 484 ierr = PetscFree(link);CHKERRQ(ierr); 485 PetscFunctionReturn(0); 486 } 487 488 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 489 { 490 PetscErrorCode ierr; 491 492 PetscFunctionBegin; 493 ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 494 #if defined(PETSC_HAVE_NVSHMEM) 495 { 496 PetscBool use_nvshmem; 497 ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr); 498 if (use_nvshmem) { 499 ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 500 PetscFunctionReturn(0); 501 } 502 } 503 #endif 504 ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 505 PetscFunctionReturn(0); 506 } 507 508 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 509 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 510 */ 511 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 512 { 513 PetscErrorCode ierr; 514 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 515 PetscInt i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks; 516 const PetscInt *rootoffset,*leafoffset; 517 MPI_Aint disp; 518 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 519 MPI_Datatype unit = link->unit; 520 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 521 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 522 523 PetscFunctionBegin; 524 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 525 if (sf->persistent) { 526 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 527 ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 528 if (direction == PETSCSF_LEAF2ROOT) { 529 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 530 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 531 cnt = rootoffset[i+1]-rootoffset[i]; 532 ierr = MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 533 } 534 } else { /* PETSCSF_ROOT2LEAF */ 535 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 536 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 537 cnt = rootoffset[i+1]-rootoffset[i]; 538 ierr = MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 539 } 540 } 541 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 542 } 543 544 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 545 ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 546 if (direction == PETSCSF_LEAF2ROOT) { 547 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 548 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 549 cnt = leafoffset[i+1]-leafoffset[i]; 550 ierr = MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 551 } 552 } else { /* PETSCSF_ROOT2LEAF */ 553 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 554 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 555 cnt = leafoffset[i+1]-leafoffset[i]; 556 ierr = MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 557 } 558 } 559 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 560 } 561 } 562 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 563 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 564 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 565 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 566 PetscFunctionReturn(0); 567 } 568 569 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 570 { 571 PetscErrorCode ierr; 572 PetscSFLink link,*p; 573 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 574 575 PetscFunctionBegin; 576 /* Look for types in cache */ 577 for (p=&bas->inuse; (link=*p); p=&link->next) { 578 PetscBool match; 579 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 580 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 581 switch (cmode) { 582 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 583 case PETSC_USE_POINTER: break; 584 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 585 } 586 *mylink = link; 587 PetscFunctionReturn(0); 588 } 589 } 590 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 591 } 592 593 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 594 { 595 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 596 PetscSFLink link = *mylink; 597 598 PetscFunctionBegin; 599 link->rootdata = NULL; 600 link->leafdata = NULL; 601 link->next = bas->avail; 602 bas->avail = link; 603 *mylink = NULL; 604 PetscFunctionReturn(0); 605 } 606 607 /* Error out on unsupported overlapped communications */ 608 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 609 { 610 PetscErrorCode ierr; 611 PetscSFLink link,*p; 612 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 613 PetscBool match; 614 615 PetscFunctionBegin; 616 if (PetscDefined(USE_DEBUG)) { 617 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 618 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 619 */ 620 if (rootdata || leafdata) { 621 for (p=&bas->inuse; (link=*p); p=&link->next) { 622 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 623 PetscCheckFalse(match && (rootdata == link->rootdata) && (leafdata == link->leafdata),PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 624 } 625 } 626 } 627 PetscFunctionReturn(0); 628 } 629 630 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 631 { 632 PetscFunctionBegin; 633 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 634 PetscFunctionReturn(0); 635 } 636 637 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 638 { 639 PetscErrorCode ierr; 640 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 641 PetscBool is2Int,is2PetscInt; 642 PetscMPIInt ni,na,nd,combiner; 643 #if defined(PETSC_HAVE_COMPLEX) 644 PetscInt nPetscComplex=0; 645 #endif 646 647 PetscFunctionBegin; 648 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 649 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 650 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 651 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 652 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 653 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 654 #if defined(PETSC_HAVE_COMPLEX) 655 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 656 #endif 657 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 658 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 659 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 660 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 661 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 662 link->bs = 1; /* default */ 663 664 if (is2Int) { 665 PackInit_PairType_int_int_1_1(link); 666 link->bs = 1; 667 link->unitbytes = 2*sizeof(int); 668 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 669 link->basicunit = MPI_2INT; 670 link->unit = MPI_2INT; 671 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 672 PackInit_PairType_PetscInt_PetscInt_1_1(link); 673 link->bs = 1; 674 link->unitbytes = 2*sizeof(PetscInt); 675 link->basicunit = MPIU_2INT; 676 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 677 link->unit = MPIU_2INT; 678 } else if (nPetscReal) { 679 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 680 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 681 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 682 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 683 link->bs = nPetscReal; 684 link->unitbytes = nPetscReal*sizeof(PetscReal); 685 link->basicunit = MPIU_REAL; 686 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 687 } else if (nPetscInt) { 688 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 689 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 690 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 691 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 692 link->bs = nPetscInt; 693 link->unitbytes = nPetscInt*sizeof(PetscInt); 694 link->basicunit = MPIU_INT; 695 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 696 #if defined(PETSC_USE_64BIT_INDICES) 697 } else if (nInt) { 698 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 699 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 700 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 701 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 702 link->bs = nInt; 703 link->unitbytes = nInt*sizeof(int); 704 link->basicunit = MPI_INT; 705 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 706 #endif 707 } else if (nSignedChar) { 708 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 709 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 710 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 711 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 712 link->bs = nSignedChar; 713 link->unitbytes = nSignedChar*sizeof(SignedChar); 714 link->basicunit = MPI_SIGNED_CHAR; 715 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 716 } else if (nUnsignedChar) { 717 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 718 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 719 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 720 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 721 link->bs = nUnsignedChar; 722 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 723 link->basicunit = MPI_UNSIGNED_CHAR; 724 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 725 #if defined(PETSC_HAVE_COMPLEX) 726 } else if (nPetscComplex) { 727 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 728 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 729 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 730 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 731 link->bs = nPetscComplex; 732 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 733 link->basicunit = MPIU_COMPLEX; 734 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 735 #endif 736 } else { 737 MPI_Aint lb,nbyte; 738 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 739 PetscCheckFalse(lb != 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb); 740 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 741 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 742 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 743 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 744 link->bs = nbyte; 745 link->unitbytes = nbyte; 746 link->basicunit = MPI_BYTE; 747 } else { 748 nInt = nbyte / sizeof(int); 749 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 750 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 751 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 752 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 753 link->bs = nInt; 754 link->unitbytes = nbyte; 755 link->basicunit = MPI_INT; 756 } 757 if (link->isbuiltin) link->unit = unit; 758 } 759 760 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 761 762 link->Memcpy = PetscSFLinkMemcpy_Host; 763 PetscFunctionReturn(0); 764 } 765 766 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 767 { 768 PetscFunctionBegin; 769 *UnpackAndOp = NULL; 770 if (PetscMemTypeHost(mtype)) { 771 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 772 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 773 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 774 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 775 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 776 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 777 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 778 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 779 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 780 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 781 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 782 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 783 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 784 } 785 #if defined(PETSC_HAVE_DEVICE) 786 else if (PetscMemTypeDevice(mtype) && !atomic) { 787 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 788 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 789 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 790 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 791 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 792 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 793 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 794 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 795 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 796 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 797 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 798 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 799 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 800 } else if (PetscMemTypeDevice(mtype) && atomic) { 801 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 802 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 803 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 804 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 805 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 806 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 807 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 808 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 809 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 810 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 811 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 812 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 813 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 814 } 815 #endif 816 PetscFunctionReturn(0); 817 } 818 819 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 820 { 821 PetscFunctionBegin; 822 *ScatterAndOp = NULL; 823 if (PetscMemTypeHost(mtype)) { 824 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 825 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 826 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 827 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 828 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 829 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 830 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 831 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 832 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 833 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 834 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 835 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 836 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 837 } 838 #if defined(PETSC_HAVE_DEVICE) 839 else if (PetscMemTypeDevice(mtype) && !atomic) { 840 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 841 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 842 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 843 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 844 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 845 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 846 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 847 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 848 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 849 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 850 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 851 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 852 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 853 } else if (PetscMemTypeDevice(mtype) && atomic) { 854 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 855 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 856 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 857 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 858 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 859 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 860 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 861 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 862 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 863 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 864 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 865 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 866 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 867 } 868 #endif 869 PetscFunctionReturn(0); 870 } 871 872 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 873 { 874 PetscFunctionBegin; 875 *FetchAndOp = NULL; 876 PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 877 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 878 #if defined(PETSC_HAVE_DEVICE) 879 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 880 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 881 #endif 882 PetscFunctionReturn(0); 883 } 884 885 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 886 { 887 PetscFunctionBegin; 888 *FetchAndOpLocal = NULL; 889 PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 890 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 891 #if defined(PETSC_HAVE_DEVICE) 892 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 893 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 894 #endif 895 PetscFunctionReturn(0); 896 } 897 898 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 899 { 900 PetscErrorCode ierr; 901 PetscLogDouble flops; 902 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 903 904 PetscFunctionBegin; 905 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 906 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 907 #if defined(PETSC_HAVE_DEVICE) 908 if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 909 #endif 910 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 911 } 912 PetscFunctionReturn(0); 913 } 914 915 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 916 { 917 PetscLogDouble flops; 918 PetscErrorCode ierr; 919 920 PetscFunctionBegin; 921 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 922 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 923 #if defined(PETSC_HAVE_DEVICE) 924 if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 925 #endif 926 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 927 } 928 PetscFunctionReturn(0); 929 } 930 931 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 932 Input Parameters: 933 +sf - The StarForest 934 .link - The link 935 .count - Number of entries to unpack 936 .start - The first index, significent when indices=NULL 937 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 938 .buf - A contiguous buffer to unpack from 939 -op - Operation after unpack 940 941 Output Parameters: 942 .data - The data to unpack to 943 */ 944 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 945 { 946 PetscFunctionBegin; 947 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 948 { 949 PetscErrorCode ierr; 950 PetscInt i; 951 if (indices) { 952 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 953 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 954 */ 955 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 956 } else { 957 ierr = MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op);CHKERRMPI(ierr); 958 } 959 } 960 PetscFunctionReturn(0); 961 #else 962 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 963 #endif 964 } 965 966 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 967 { 968 PetscFunctionBegin; 969 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 970 { 971 PetscErrorCode ierr; 972 PetscInt i,disp; 973 if (!srcIdx) { 974 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 975 } else { 976 for (i=0; i<count; i++) { 977 disp = dstIdx? dstIdx[i] : dstStart + i; 978 ierr = MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 979 } 980 } 981 } 982 PetscFunctionReturn(0); 983 #else 984 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 985 #endif 986 } 987 988 /*============================================================================= 989 Pack/Unpack/Fetch/Scatter routines 990 ============================================================================*/ 991 992 /* Pack rootdata to rootbuf 993 Input Parameters: 994 + sf - The SF this packing works on. 995 . link - It gives the memtype of the roots and also provides root buffer. 996 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 997 - rootdata - Where to read the roots. 998 999 Notes: 1000 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1001 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 1002 */ 1003 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1004 { 1005 PetscErrorCode ierr; 1006 const PetscInt *rootindices = NULL; 1007 PetscInt count,start; 1008 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1009 PetscMemType rootmtype = link->rootmtype; 1010 PetscSFPackOpt opt = NULL; 1011 1012 PetscFunctionBegin; 1013 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1014 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1015 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1016 ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1017 ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1018 } 1019 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1020 PetscFunctionReturn(0); 1021 } 1022 1023 /* Pack leafdata to leafbuf */ 1024 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1025 { 1026 PetscErrorCode ierr; 1027 const PetscInt *leafindices = NULL; 1028 PetscInt count,start; 1029 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1030 PetscMemType leafmtype = link->leafmtype; 1031 PetscSFPackOpt opt = NULL; 1032 1033 PetscFunctionBegin; 1034 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1035 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1036 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1037 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1038 ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1039 } 1040 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1041 PetscFunctionReturn(0); 1042 } 1043 1044 /* Pack rootdata to rootbuf, which are in the same memory space */ 1045 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1046 { 1047 PetscErrorCode ierr; 1048 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1049 1050 PetscFunctionBegin; 1051 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1052 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1053 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1054 } 1055 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1056 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);} 1057 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1058 PetscFunctionReturn(0); 1059 } 1060 /* Pack leafdata to leafbuf, which are in the same memory space */ 1061 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1062 { 1063 PetscErrorCode ierr; 1064 1065 PetscFunctionBegin; 1066 if (scope == PETSCSF_REMOTE) { 1067 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1068 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1069 } 1070 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1071 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);} 1072 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1073 PetscFunctionReturn(0); 1074 } 1075 1076 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1077 { 1078 PetscErrorCode ierr; 1079 const PetscInt *rootindices = NULL; 1080 PetscInt count,start; 1081 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1082 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1083 PetscMemType rootmtype = link->rootmtype; 1084 PetscSFPackOpt opt = NULL; 1085 1086 PetscFunctionBegin; 1087 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1088 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1089 if (UnpackAndOp) { 1090 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1091 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1092 } else { 1093 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1094 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1095 } 1096 } 1097 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1098 PetscFunctionReturn(0); 1099 } 1100 1101 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1102 { 1103 PetscErrorCode ierr; 1104 const PetscInt *leafindices = NULL; 1105 PetscInt count,start; 1106 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1107 PetscMemType leafmtype = link->leafmtype; 1108 PetscSFPackOpt opt = NULL; 1109 1110 PetscFunctionBegin; 1111 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1112 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1113 if (UnpackAndOp) { 1114 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1115 ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1116 } else { 1117 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1118 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1119 } 1120 } 1121 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1122 PetscFunctionReturn(0); 1123 } 1124 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1125 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1126 { 1127 PetscErrorCode ierr; 1128 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1129 1130 PetscFunctionBegin; 1131 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1132 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);} 1133 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1134 if (scope == PETSCSF_REMOTE) { 1135 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1136 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1137 } 1138 PetscFunctionReturn(0); 1139 } 1140 1141 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1142 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1143 { 1144 PetscErrorCode ierr; 1145 1146 PetscFunctionBegin; 1147 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1148 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);} 1149 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1150 if (scope == PETSCSF_REMOTE) { 1151 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1152 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1153 } 1154 PetscFunctionReturn(0); 1155 } 1156 1157 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1158 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1159 { 1160 PetscErrorCode ierr; 1161 const PetscInt *rootindices = NULL; 1162 PetscInt count,start; 1163 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1164 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1165 PetscMemType rootmtype = link->rootmtype; 1166 PetscSFPackOpt opt = NULL; 1167 1168 PetscFunctionBegin; 1169 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1170 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1171 /* Do FetchAndOp on rootdata with rootbuf */ 1172 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 1173 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1174 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1175 } 1176 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1177 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1182 { 1183 PetscErrorCode ierr; 1184 const PetscInt *rootindices = NULL,*leafindices = NULL; 1185 PetscInt count,rootstart,leafstart; 1186 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1187 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1188 PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1189 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1190 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1191 char *srcbuf = NULL,*dstbuf = NULL; 1192 PetscBool dstdups; 1193 1194 PetscFunctionBegin; 1195 if (!buflen) PetscFunctionReturn(0); 1196 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1197 if (direction == PETSCSF_ROOT2LEAF) { 1198 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1199 srcmtype = rootmtype; 1200 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1201 dstmtype = leafmtype; 1202 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1203 } else { 1204 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1205 srcmtype = leafmtype; 1206 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1207 dstmtype = rootmtype; 1208 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1209 } 1210 ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 1211 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1212 if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 1213 if (direction == PETSCSF_ROOT2LEAF) { 1214 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1215 } else { 1216 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1217 } 1218 } else { 1219 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1220 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1221 ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1222 if (ScatterAndOp) { 1223 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1224 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1225 if (direction == PETSCSF_ROOT2LEAF) { 1226 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1227 } else { 1228 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1229 } 1230 } else { 1231 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1232 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1233 if (direction == PETSCSF_ROOT2LEAF) { 1234 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1235 } else { 1236 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1237 } 1238 } 1239 } 1240 PetscFunctionReturn(0); 1241 } 1242 1243 /* Fetch rootdata to leafdata and leafupdate locally */ 1244 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1245 { 1246 PetscErrorCode ierr; 1247 const PetscInt *rootindices = NULL,*leafindices = NULL; 1248 PetscInt count,rootstart,leafstart; 1249 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1250 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1251 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1252 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1253 1254 PetscFunctionBegin; 1255 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1256 if (rootmtype != leafmtype) { 1257 /* The local communication has to go through pack and unpack */ 1258 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1259 } else { 1260 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1261 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1262 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1263 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1264 } 1265 PetscFunctionReturn(0); 1266 } 1267 1268 /* 1269 Create per-rank pack/unpack optimizations based on indice patterns 1270 1271 Input Parameters: 1272 + n - Number of destination ranks 1273 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1274 - idx - [*] Array storing indices 1275 1276 Output Parameters: 1277 + opt - Pack optimizations. NULL if no optimizations. 1278 */ 1279 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1280 { 1281 PetscErrorCode ierr; 1282 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1283 PetscBool optimizable = PETSC_TRUE; 1284 PetscSFPackOpt opt; 1285 1286 PetscFunctionBegin; 1287 ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1288 ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1289 opt->n = opt->array[0] = n; 1290 opt->offset = opt->array + 1; 1291 opt->start = opt->array + n + 2; 1292 opt->dx = opt->array + 2*n + 2; 1293 opt->dy = opt->array + 3*n + 2; 1294 opt->dz = opt->array + 4*n + 2; 1295 opt->X = opt->array + 5*n + 2; 1296 opt->Y = opt->array + 6*n + 2; 1297 1298 for (r=0; r<n; r++) { /* For each destination rank */ 1299 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1300 p = offset[r]; 1301 start = idx[p]; /* First index for this rank */ 1302 p++; 1303 1304 /* Search in X dimension */ 1305 for (dx=1; dx<m; dx++,p++) { 1306 if (start+dx != idx[p]) break; 1307 } 1308 1309 dydz = m/dx; 1310 X = dydz > 1 ? (idx[p]-start) : dx; 1311 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1312 if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1313 for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1314 for (i=0; i<dx; i++,p++) { 1315 if (start+X*dy+i != idx[p]) { 1316 if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1317 else goto Z_dimension; 1318 } 1319 } 1320 } 1321 1322 Z_dimension: 1323 dz = m/(dx*dy); 1324 Y = dz > 1 ? (idx[p]-start)/X : dy; 1325 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1326 if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1327 for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1328 for (j=0; j<dy; j++) { 1329 for (i=0; i<dx; i++,p++) { 1330 if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 1331 } 1332 } 1333 } 1334 opt->start[r] = start; 1335 opt->dx[r] = dx; 1336 opt->dy[r] = dy; 1337 opt->dz[r] = dz; 1338 opt->X[r] = X; 1339 opt->Y[r] = Y; 1340 } 1341 1342 finish: 1343 /* If not optimizable, free arrays to save memory */ 1344 if (!n || !optimizable) { 1345 ierr = PetscFree(opt->array);CHKERRQ(ierr); 1346 ierr = PetscFree(opt);CHKERRQ(ierr); 1347 *out = NULL; 1348 } else { 1349 opt->offset[0] = 0; 1350 for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1351 *out = opt; 1352 } 1353 PetscFunctionReturn(0); 1354 } 1355 1356 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 1357 { 1358 PetscErrorCode ierr; 1359 PetscSFPackOpt opt = *out; 1360 1361 PetscFunctionBegin; 1362 if (opt) { 1363 ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr); 1364 ierr = PetscFree(opt);CHKERRQ(ierr); 1365 *out = NULL; 1366 } 1367 PetscFunctionReturn(0); 1368 } 1369 1370 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1371 { 1372 PetscErrorCode ierr; 1373 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1374 PetscInt i,j; 1375 1376 PetscFunctionBegin; 1377 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1378 for (i=0; i<2; i++) { /* Set defaults */ 1379 sf->leafstart[i] = 0; 1380 sf->leafcontig[i] = PETSC_TRUE; 1381 sf->leafdups[i] = PETSC_FALSE; 1382 bas->rootstart[i] = 0; 1383 bas->rootcontig[i] = PETSC_TRUE; 1384 bas->rootdups[i] = PETSC_FALSE; 1385 } 1386 1387 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1388 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1389 1390 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1391 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1392 1393 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1394 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1395 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1396 } 1397 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1398 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1399 } 1400 1401 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1402 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1403 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1404 1405 /* Are root indices for self and remote contiguous? */ 1406 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1407 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1408 1409 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1410 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1411 1412 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1413 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1414 } 1415 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1416 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1417 } 1418 1419 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1420 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1421 1422 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1423 if (PetscDefined(HAVE_DEVICE)) { 1424 PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1425 if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1426 if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1427 if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1428 if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1429 } 1430 PetscFunctionReturn(0); 1431 } 1432 1433 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1434 { 1435 PetscErrorCode ierr; 1436 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1437 PetscInt i; 1438 1439 PetscFunctionBegin; 1440 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1441 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1442 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1443 #if defined(PETSC_HAVE_DEVICE) 1444 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1445 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1446 #endif 1447 } 1448 PetscFunctionReturn(0); 1449 } 1450