#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#include <petsccublas.h>
#endif
#if defined(PETSC_HAVE_HIP)
#include <hip/hip_runtime.h>
#include <petschipblas.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Paste four tokens with underscores, e.g., CPPJoin4(Pack,int,8,1) -> Pack_int_8_1. Used to name the generated routines. */
#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);   } while (0)    /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t));  } while (0)    /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR; the op argument is ignored */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)              /* plain assignment; the op argument is ignored */
/* Ref MPI MAXLOC: on equal values keep the smaller index; otherwise keep the pair that wins under op (> for MAXLOC, < for MINLOC) */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while (0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bsz.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);} /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available: copy 3D subdomains row by row */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2 += opt->dx[r]*MBS; \
          } \
      } \
    } else { /* general gather via idx[] */ \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)  /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy (PetscArraycpy) in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip self-copy */ \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { /* general scatter via idx[] */ \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname    Name of the Op, such as Add, Mult, LAND, etc.
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   .OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOp - like UnpackAndOp, but additionally returns the old (pre-op) root values in packed[] */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; /* root entry offset (contiguous or indexed) */ \
      l = i*MBS;                         /* buffer entry offset */ \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; /* return old value */ \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_ScatterAndOp - apply Op between entries of two (possibly sparse) arrays src and dst, without an intermediate pack buffer */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous: reuse UnpackAndOp with src acting as the packed buffer */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - on-process fetch-and-op: leafupdate gets old root values, roots get Op(root,leaf) */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; /* fetch old root value first */ \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,            Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax   = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin   = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax  = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin  = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Logical ops.
   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90"
 */
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops. Type is a pair struct with fields u (value) and i (index); OP_XLOC implements the MAXLOC/MINLOC semantics. */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Integer types support insert, add/mult, max/min, logical and bitwise ops */
#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

/* Real types support insert, add/mult and max/min (no logical/bitwise) */
#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
/* Complex types support insert and add/mult only (no ordering) */
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

/* Dumb (opaque) types support insert only */
#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0) /* unit = 1*n MPIU_REALs, n>1 */
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types (value,index) used to implement MPI MAXLOC/MINLOC semantics */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;} PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback for MPI implementations lacking MPI_Type_dup: a committed contiguous type of count 1 is equivalent */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/* Free a link: device-specific resources, the duplicated MPI datatype, persistent requests and host buffers, then the link itself */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* 8 = reqs per (direction x mtype x direct) combination -- see link->reqs layout */

  PetscFunctionBegin;
  /* Destroy device-specific fields */
  if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}

  /* Destroy host related fields */
  if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);} /* only duplicated types are freed */
  if (!link->use_nvshmem) {
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Create a link for the given (unit, rootdata, leafdata, op, sfop) combination, dispatching to NVSHMEM when available and applicable, otherwise MPI */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
#if defined(PETSC_HAVE_NVSHMEM)
  {
    PetscBool use_nvshmem;
    ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
    if (use_nvshmem) {
      ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
  }
#endif
  ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
528 */ 529 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 530 { 531 PetscErrorCode ierr; 532 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 533 PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 534 const PetscInt *rootoffset,*leafoffset; 535 PetscMPIInt n; 536 MPI_Aint disp; 537 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 538 MPI_Datatype unit = link->unit; 539 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 540 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 541 542 PetscFunctionBegin; 543 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 544 if (sf->persistent) { 545 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 546 ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 547 if (direction == PETSCSF_LEAF2ROOT) { 548 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 549 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 550 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 551 ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 552 } 553 } else { /* PETSCSF_ROOT2LEAF */ 554 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 555 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 556 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 557 ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 558 } 559 } 560 
link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 561 } 562 563 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 564 ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 565 if (direction == PETSCSF_LEAF2ROOT) { 566 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 567 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 568 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 569 ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 570 } 571 } else { /* PETSCSF_ROOT2LEAF */ 572 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 573 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 574 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 575 ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 576 } 577 } 578 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 579 } 580 } 581 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 582 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 583 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 584 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 585 PetscFunctionReturn(0); 586 } 587 588 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 589 { 590 PetscErrorCode ierr; 591 PetscSFLink link,*p; 592 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 593 594 PetscFunctionBegin; 595 /* Look for types in cache */ 596 for (p=&bas->inuse; (link=*p); 
p=&link->next) { 597 PetscBool match; 598 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 599 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 600 switch (cmode) { 601 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 602 case PETSC_USE_POINTER: break; 603 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 604 } 605 *mylink = link; 606 PetscFunctionReturn(0); 607 } 608 } 609 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 610 } 611 612 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 613 { 614 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 615 PetscSFLink link = *mylink; 616 617 PetscFunctionBegin; 618 link->rootdata = NULL; 619 link->leafdata = NULL; 620 link->next = bas->avail; 621 bas->avail = link; 622 *mylink = NULL; 623 PetscFunctionReturn(0); 624 } 625 626 /* Error out on unsupported overlapped communications */ 627 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 628 { 629 PetscErrorCode ierr; 630 PetscSFLink link,*p; 631 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 632 PetscBool match; 633 634 PetscFunctionBegin; 635 if (PetscDefined(USE_DEBUG)) { 636 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 637 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 638 */ 639 if (rootdata || leafdata) { 640 for (p=&bas->inuse; (link=*p); p=&link->next) { 641 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 642 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. 
Undo the overlapping to avoid the error.",rootdata,leafdata); 643 } 644 } 645 } 646 PetscFunctionReturn(0); 647 } 648 649 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 650 { 651 PetscFunctionBegin; 652 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 653 PetscFunctionReturn(0); 654 } 655 656 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 657 { 658 PetscErrorCode ierr; 659 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 660 PetscBool is2Int,is2PetscInt; 661 PetscMPIInt ni,na,nd,combiner; 662 #if defined(PETSC_HAVE_COMPLEX) 663 PetscInt nPetscComplex=0; 664 #endif 665 666 PetscFunctionBegin; 667 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 668 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 669 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 670 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 671 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 672 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 673 #if defined(PETSC_HAVE_COMPLEX) 674 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 675 #endif 676 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 677 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 678 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 679 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 680 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? 
PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 681 link->bs = 1; /* default */ 682 683 if (is2Int) { 684 PackInit_PairType_int_int_1_1(link); 685 link->bs = 1; 686 link->unitbytes = 2*sizeof(int); 687 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 688 link->basicunit = MPI_2INT; 689 link->unit = MPI_2INT; 690 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 691 PackInit_PairType_PetscInt_PetscInt_1_1(link); 692 link->bs = 1; 693 link->unitbytes = 2*sizeof(PetscInt); 694 link->basicunit = MPIU_2INT; 695 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 696 link->unit = MPIU_2INT; 697 } else if (nPetscReal) { 698 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 699 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 700 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 701 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 702 link->bs = nPetscReal; 703 link->unitbytes = nPetscReal*sizeof(PetscReal); 704 link->basicunit = MPIU_REAL; 705 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 706 } else if (nPetscInt) { 707 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 708 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 709 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 710 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) 
PackInit_IntegerType_PetscInt_1_0(link); 711 link->bs = nPetscInt; 712 link->unitbytes = nPetscInt*sizeof(PetscInt); 713 link->basicunit = MPIU_INT; 714 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 715 #if defined(PETSC_USE_64BIT_INDICES) 716 } else if (nInt) { 717 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 718 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 719 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 720 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 721 link->bs = nInt; 722 link->unitbytes = nInt*sizeof(int); 723 link->basicunit = MPI_INT; 724 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 725 #endif 726 } else if (nSignedChar) { 727 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 728 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 729 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 730 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 731 link->bs = nSignedChar; 732 link->unitbytes = nSignedChar*sizeof(SignedChar); 733 link->basicunit = MPI_SIGNED_CHAR; 734 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 735 } else if (nUnsignedChar) { 736 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 737 else if (nUnsignedChar == 4) 
PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 738 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 739 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 740 link->bs = nUnsignedChar; 741 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 742 link->basicunit = MPI_UNSIGNED_CHAR; 743 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 744 #if defined(PETSC_HAVE_COMPLEX) 745 } else if (nPetscComplex) { 746 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 747 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 748 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 749 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 750 link->bs = nPetscComplex; 751 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 752 link->basicunit = MPIU_COMPLEX; 753 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 754 #endif 755 } else { 756 MPI_Aint lb,nbyte; 757 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 758 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 759 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 760 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 761 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); 
else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else { /* The type size is a multiple of sizeof(int); pack it as ints for better vectorization */
      nInt = nbyte / sizeof(int);
      if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* For non-builtin units keep our own duplicate of the datatype so the link owns it */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}

/* Select the UnpackAndOp routine stored in the link that matches the memory type (host/device), the MPI_Op,
   and whether atomics are required (atomic matters only for device memory). *UnpackAndOp is set to NULL when
   no routine matches; callers then fall back to the MPI_Reduce_local path. */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) { /* device, plain (non-atomic) kernels */
    if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) { /* device, atomic kernels (destination has duplicate indices) */
    if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Same selection logic as PetscSFLinkGetUnpackAndOp, but for the ScatterAndOp routines, which combine
   a gather from one sparse array with an op-scatter into another. NULL on no match. */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp =
link->d_ScatterAndMult; 862 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 863 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 864 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 865 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 866 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 867 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 868 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 869 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 870 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 871 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 872 } else if (PetscMemTypeDevice(mtype) && atomic) { 873 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 874 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 875 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 876 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 877 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 878 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 879 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 880 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 881 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 882 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 883 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 884 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 885 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 886 } 887 #endif 888 PetscFunctionReturn(0); 889 } 890 891 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode 
(**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  /* Only summation is supported for fetch-and-op; other ops error out */
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

/* Like PetscSFLinkGetFetchAndOp but returns the local (root-to-leaf in one call) fetch-and-add variant.
   Again only MPI_SUM/MPIU_SUM is supported. */
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/* Log flops for the reduction performed when unpacking root data; only counts when the op is a real
   reduction (not MPI_REPLACE) on PetscScalars. Logs as GPU flops when roots live in device memory. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* Leaf-side counterpart of PetscSFLinkLogFlopsAfterUnpackRootData */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
  Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

  Output Arguments:
  .data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
    } else {
      /* Contiguous destination: reduce all entries in one call */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}

/* Scatter-with-op fallback built on MPI_Reduce_local; handles sparse source (srcIdx) and sparse or
   contiguous destination (dstIdx/dstStart). When the source is contiguous it reduces to the unpack case. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) { /* contiguous source starting at srcStart: same as unpacking from a buffer */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

   Notes:
     When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
     in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
 */
PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  /* NOTE(review): PETSCSF_Pack is also logged by the public wrapper PetscSFLinkPackRootData around this
     call, so the event is begun/ended twice when called through the wrapper — confirm this is intended */
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack rootdata to rootbuf, which are in the same memory space */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
    if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
  }
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf, which are in the same memory space */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) {
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
    if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */
  }
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack rootbuf into rootdata with the given op; falls back to MPI_Reduce_local when the link has
   no matching UnpackAndOp routine for this (mtype,op,dups) combination */
PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else { /* fallback path: host indices + MPI_Reduce_local */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Leaf-side counterpart of PetscSFLinkUnpackRootData_Private */
PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else { /* fallback path: host indices + MPI_Reduce_local */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack rootbuf to rootdata, which are in the same memory space */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1164 { 1165 PetscErrorCode ierr; 1166 1167 PetscFunctionBegin; 1168 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1169 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);} 1170 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1171 if (scope == PETSCSF_REMOTE) { 1172 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1173 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1174 } 1175 PetscFunctionReturn(0); 1176 } 1177 1178 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1179 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1180 { 1181 PetscErrorCode ierr; 1182 const PetscInt *rootindices = NULL; 1183 PetscInt count,start; 1184 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1185 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1186 PetscMemType rootmtype = link->rootmtype; 1187 PetscSFPackOpt opt = NULL; 1188 1189 PetscFunctionBegin; 1190 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1191 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1192 /* Do FetchAndOp on rootdata with rootbuf */ 1193 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 1194 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1195 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1196 } 1197 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1198 
ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1203 { 1204 PetscErrorCode ierr; 1205 const PetscInt *rootindices = NULL,*leafindices = NULL; 1206 PetscInt count,rootstart,leafstart; 1207 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1208 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1209 PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1210 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1211 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1212 char *srcbuf = NULL,*dstbuf = NULL; 1213 PetscBool dstdups; 1214 1215 if (!buflen) PetscFunctionReturn(0); 1216 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1217 if (direction == PETSCSF_ROOT2LEAF) { 1218 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1219 srcmtype = rootmtype; 1220 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1221 dstmtype = leafmtype; 1222 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1223 } else { 1224 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1225 srcmtype = leafmtype; 1226 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1227 dstmtype = rootmtype; 1228 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1229 } 1230 ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 1231 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1232 if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 1233 if (direction == PETSCSF_ROOT2LEAF) { 1234 ierr = 
PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1235 } else { 1236 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1237 } 1238 } else { 1239 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1240 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1241 ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1242 if (ScatterAndOp) { 1243 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1244 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1245 if (direction == PETSCSF_ROOT2LEAF) { 1246 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1247 } else { 1248 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1249 } 1250 } else { 1251 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1252 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1253 if (direction == PETSCSF_ROOT2LEAF) { 1254 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1255 } else { 1256 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1257 } 1258 } 1259 } 1260 PetscFunctionReturn(0); 1261 } 1262 1263 /* Fetch rootdata to leafdata and leafupdate locally */ 1264 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void 
*leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  +  n       - Number of destination ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
  -  idx     - [*]   Array storing indices

   Output Parameters:
  +  opt     - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* All per-rank fields are views into one flat allocation: [n | offset(n+1) | start(n) | dx | dy | dz | X | Y] */
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  /* Try to recognize each rank's indices as a dx x dy x dz sub-block of an X x Y x Z lattice starting at <start> */
  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: longest run of consecutive indices */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    X    = dydz > 1 ? (idx[p]-start) : dx; /* row stride X inferred from the first index of the second row */
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* break at a row boundary: the y-extent ends here, try the z dimension */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    Y  = dz > 1 ? (idx[p]-start)/X : dy; /* plane stride Y inferred from the first index of the second plane */
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Rebuild offsets from the factored block sizes so they are 0-based for the packed buffer */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

/* Free a pack optimization plan created by PetscSFCreatePackOpt; no-op when *out is NULL */
PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

/* Analyze the SF's root/leaf index layout and fill in the pack-related fields (contiguity, start, dups, pack opts) */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1409 1410 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1411 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1412 1413 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1414 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1415 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1416 } 1417 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1418 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1419 } 1420 1421 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1422 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1423 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1424 1425 /* Are root indices for self and remote contiguous? 
*/ 1426 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1427 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1428 1429 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1430 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1431 1432 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1433 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1434 } 1435 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1436 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1437 } 1438 1439 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1440 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1441 1442 #if defined(PETSC_HAVE_DEVICE) 1443 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1444 if (PetscDefined(HAVE_DEVICE)) { 1445 PetscBool ismulti = (sf->multi == sf) ? 
PETSC_TRUE : PETSC_FALSE; 1446 if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1447 if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1448 if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1449 if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1450 } 1451 #endif 1452 PetscFunctionReturn(0); 1453 } 1454 1455 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1456 { 1457 PetscErrorCode ierr; 1458 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1459 PetscInt i; 1460 1461 PetscFunctionBegin; 1462 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1463 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1464 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1465 #if defined(PETSC_HAVE_DEVICE) 1466 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1467 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1468 #endif 1469 } 1470 PetscFunctionReturn(0); 1471 } 1472