1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 #if defined(PETSC_HAVE_CUDA) 8 #include <cuda_runtime.h> 9 #include <petsccublas.h> 10 #endif 11 #if defined(PETSC_HAVE_HIP) 12 #include <hip/hip_runtime.h> 13 #include <petschipblas.h> 14 #endif 15 /* 16 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 17 * therefore we pack data types manually. This file defines packing routines for the standard data types. 18 */ 19 20 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 21 22 /* Operations working like s += t */ 23 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 24 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 25 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 26 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 27 /* Ref MPI MAXLOC */ 28 #define OP_XLOC(op,s,t) \ 29 do { \ 30 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 31 else if (!((s).u op (t).u)) s = t; \ 32 } while (0) 33 34 /* DEF_PackFunc - macro defining a Pack routine 35 36 Arguments of the macro: 37 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 38 .BS Block size for vectorization. It is a factor of bsz. 39 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 40 41 Arguments of the Pack routine: 42 +count Number of indices in idx[]. 43 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 44 .opt Per-pack optimization plan. NULL means no such plan. 45 .idx Indices of entries to packed. 46 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 47 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 48 -packed Address of the packed data. 49 */ 50 #define DEF_PackFunc(Type,BS,EQ) \ 51 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 52 { \ 53 PetscErrorCode ierr; \ 54 const Type *u = (const Type*)unpacked,*u2; \ 55 Type *p = (Type*)packed,*p2; \ 56 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 57 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 58 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 59 PetscFunctionBegin; \ 60 if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 61 else if (opt) { /* has optimizations available */ \ 62 p2 = p; \ 63 for (r=0; r<opt->n; r++) { \ 64 u2 = u + opt->start[r]*MBS; \ 65 X = opt->X[r]; \ 66 Y = opt->Y[r]; \ 67 for (k=0; k<opt->dz[r]; k++) \ 68 for (j=0; j<opt->dy[r]; j++) { \ 69 ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 70 p2 += opt->dx[r]*MBS; \ 71 } \ 72 } \ 73 } else { \ 74 for (i=0; i<count; i++) \ 75 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 76 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 77 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 78 } \ 79 PetscFunctionReturn(0); \ 80 } 81 82 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 83 and inserts into a sparse array. 84 85 Arguments: 86 .Type Type of the data 87 .BS Block size for vectorization 88 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 89 90 Notes: 91 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 92 */ 93 #define DEF_UnpackFunc(Type,BS,EQ) \ 94 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 95 { \ 96 PetscErrorCode ierr; \ 97 Type *u = (Type*)unpacked,*u2; \ 98 const Type *p = (const Type*)packed; \ 99 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 100 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 101 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 102 PetscFunctionBegin; \ 103 if (!idx) { \ 104 u += start*MBS; \ 105 if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 106 } else if (opt) { /* has optimizations available */ \ 107 for (r=0; r<opt->n; r++) { \ 108 u2 = u + opt->start[r]*MBS; \ 109 X = opt->X[r]; \ 110 Y = opt->Y[r]; \ 111 for (k=0; k<opt->dz[r]; k++) \ 112 for (j=0; j<opt->dy[r]; j++) { \ 113 ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 114 p += opt->dx[r]*MBS; \ 115 } \ 116 } \ 117 } else { \ 118 for (i=0; i<count; i++) \ 119 for (j=0; j<M; j++) \ 120 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 121 } \ 122 PetscFunctionReturn(0); \ 123 } 124 125 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 126 127 Arguments: 128 +Opname Name of the Op, such as Add, Mult, LAND, etc. 129 .Type Type of the data 130 .BS Block size for vectorization 131 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 132 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 133 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 134 */ 135 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 136 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 137 { \ 138 Type *u = (Type*)unpacked,*u2; \ 139 const Type *p = (const Type*)packed; \ 140 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 141 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 142 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 143 PetscFunctionBegin; \ 144 if (!idx) { \ 145 u += start*MBS; \ 146 for (i=0; i<count; i++) \ 147 for (j=0; j<M; j++) \ 148 for (k=0; k<BS; k++) \ 149 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 150 } else if (opt) { /* idx[] has patterns */ \ 151 for (r=0; r<opt->n; r++) { \ 152 u2 = u + opt->start[r]*MBS; \ 153 X = opt->X[r]; \ 154 Y = opt->Y[r]; \ 155 for (k=0; k<opt->dz[r]; k++) \ 156 for (j=0; j<opt->dy[r]; j++) { \ 157 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 158 p += opt->dx[r]*MBS; \ 159 } \ 160 } \ 161 } else { \ 162 for (i=0; i<count; i++) \ 163 for (j=0; j<M; j++) \ 164 for (k=0; k<BS; k++) \ 165 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 166 } \ 167 PetscFunctionReturn(0); \ 168 } 169 170 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 171 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 172 { \ 173 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 174 PetscInt i,j,k,r,l,bs=link->bs; \ 175 const PetscInt M = (EQ) ? 1 : bs/BS; \ 176 const PetscInt MBS = M*BS; \ 177 PetscFunctionBegin; \ 178 for (i=0; i<count; i++) { \ 179 r = (!idx ? start+i : idx[i])*MBS; \ 180 l = i*MBS; \ 181 for (j=0; j<M; j++) \ 182 for (k=0; k<BS; k++) { \ 183 tmp = u[r+j*BS+k]; \ 184 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 185 p[l+j*BS+k] = tmp; \ 186 } \ 187 } \ 188 PetscFunctionReturn(0); \ 189 } 190 191 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 192 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 193 { \ 194 PetscErrorCode ierr; \ 195 const Type *u = (const Type*)src; \ 196 Type *v = (Type*)dst; \ 197 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 198 const PetscInt M = (EQ) ? 1 : bs/BS; \ 199 const PetscInt MBS = M*BS; \ 200 PetscFunctionBegin; \ 201 if (!srcIdx) { /* src is contiguous */ \ 202 u += srcStart*MBS; \ 203 ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 204 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 205 u += srcOpt->start[0]*MBS; \ 206 v += dstStart*MBS; \ 207 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 208 for (k=0; k<srcOpt->dz[0]; k++) \ 209 for (j=0; j<srcOpt->dy[0]; j++) { \ 210 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 211 v += srcOpt->dx[0]*MBS; \ 212 } \ 213 } else { /* all other cases */ \ 214 for (i=0; i<count; i++) { \ 215 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 216 t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 217 for (j=0; j<M; j++) \ 218 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 219 } \ 220 } \ 221 PetscFunctionReturn(0); \ 222 } 223 224 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 225 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 226 { \ 227 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 228 const Type *ldata = (const Type*)leafdata; \ 229 PetscInt i,j,k,r,l,bs = link->bs; \ 230 const PetscInt M = (EQ) ? 1 : bs/BS; \ 231 const PetscInt MBS = M*BS; \ 232 PetscFunctionBegin; \ 233 for (i=0; i<count; i++) { \ 234 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 235 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 236 for (j=0; j<M; j++) \ 237 for (k=0; k<BS; k++) { \ 238 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 239 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 240 } \ 241 } \ 242 PetscFunctionReturn(0); \ 243 } 244 245 /* Pack, Unpack/Fetch ops */ 246 #define DEF_Pack(Type,BS,EQ) \ 247 DEF_PackFunc(Type,BS,EQ) \ 248 DEF_UnpackFunc(Type,BS,EQ) \ 249 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 250 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 251 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 252 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 253 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 254 } 255 256 /* Add, Mult ops */ 257 #define DEF_Add(Type,BS,EQ) \ 258 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 259 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 260 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 261 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 262 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 263 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 264 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 265 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 266 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 267 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 268 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 269 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 270 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 271 } 272 273 /* Max, Min ops */ 274 #define DEF_Cmp(Type,BS,EQ) \ 275 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 276 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 277 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 278 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 279 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 280 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 281 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 282 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 283 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 284 } 285 286 /* Logical ops. 287 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 288 the compilation warning "empty macro arguments are undefined in ISO C90" 289 */ 290 #define DEF_Log(Type,BS,EQ) \ 291 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 292 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 293 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 294 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 295 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 296 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 297 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 298 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 299 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 300 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 301 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 302 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 303 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 304 } 305 306 /* Bitwise ops */ 307 #define DEF_Bit(Type,BS,EQ) \ 308 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 309 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 310 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 311 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 312 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 313 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 314 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 315 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 316 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 317 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 318 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 319 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 320 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 321 } 322 323 /* Maxloc, Minloc ops */ 324 #define DEF_Xloc(Type,BS,EQ) \ 325 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 326 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 327 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 328 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 329 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 330 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 331 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 332 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 333 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 334 } 335 336 #define DEF_IntegerType(Type,BS,EQ) \ 337 DEF_Pack(Type,BS,EQ) \ 338 DEF_Add(Type,BS,EQ) \ 339 DEF_Cmp(Type,BS,EQ) \ 340 DEF_Log(Type,BS,EQ) \ 341 DEF_Bit(Type,BS,EQ) \ 342 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 343 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 344 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 345 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 346 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 347 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 348 } 349 350 #define DEF_RealType(Type,BS,EQ) \ 351 DEF_Pack(Type,BS,EQ) \ 352 DEF_Add(Type,BS,EQ) \ 353 DEF_Cmp(Type,BS,EQ) \ 354 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 355 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 356 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 357 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 358 } 359 360 #if defined(PETSC_HAVE_COMPLEX) 361 #define DEF_ComplexType(Type,BS,EQ) \ 362 DEF_Pack(Type,BS,EQ) \ 363 DEF_Add(Type,BS,EQ) \ 364 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 365 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 366 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 367 } 368 #endif 369 370 #define DEF_DumbType(Type,BS,EQ) \ 371 DEF_Pack(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 } 375 376 /* Maxloc, Minloc */ 377 #define DEF_PairType(Type,BS,EQ) \ 378 DEF_Pack(Type,BS,EQ) \ 379 DEF_Xloc(Type,BS,EQ) \ 380 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 381 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 382 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 383 } 384 385 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 386 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 387 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 388 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 389 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 390 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 391 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 392 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 393 394 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 395 DEF_IntegerType(int,1,1) 396 DEF_IntegerType(int,2,1) 397 DEF_IntegerType(int,4,1) 398 DEF_IntegerType(int,8,1) 399 DEF_IntegerType(int,1,0) 400 DEF_IntegerType(int,2,0) 401 DEF_IntegerType(int,4,0) 402 DEF_IntegerType(int,8,0) 403 #endif 404 405 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 406 typedef signed char SignedChar; 407 DEF_IntegerType(SignedChar,1,1) 408 DEF_IntegerType(SignedChar,2,1) 409 DEF_IntegerType(SignedChar,4,1) 410 DEF_IntegerType(SignedChar,8,1) 411 DEF_IntegerType(SignedChar,1,0) 412 DEF_IntegerType(SignedChar,2,0) 413 DEF_IntegerType(SignedChar,4,0) 414 DEF_IntegerType(SignedChar,8,0) 415 416 typedef unsigned char UnsignedChar; 417 DEF_IntegerType(UnsignedChar,1,1) 418 DEF_IntegerType(UnsignedChar,2,1) 419 DEF_IntegerType(UnsignedChar,4,1) 420 DEF_IntegerType(UnsignedChar,8,1) 421 DEF_IntegerType(UnsignedChar,1,0) 422 DEF_IntegerType(UnsignedChar,2,0) 423 DEF_IntegerType(UnsignedChar,4,0) 424 DEF_IntegerType(UnsignedChar,8,0) 425 426 DEF_RealType(PetscReal,1,1) 427 DEF_RealType(PetscReal,2,1) 428 DEF_RealType(PetscReal,4,1) 429 DEF_RealType(PetscReal,8,1) 430 DEF_RealType(PetscReal,1,0) 431 DEF_RealType(PetscReal,2,0) 432 DEF_RealType(PetscReal,4,0) 433 DEF_RealType(PetscReal,8,0) 434 435 #if defined(PETSC_HAVE_COMPLEX) 436 DEF_ComplexType(PetscComplex,1,1) 437 DEF_ComplexType(PetscComplex,2,1) 438 DEF_ComplexType(PetscComplex,4,1) 439 DEF_ComplexType(PetscComplex,8,1) 440 DEF_ComplexType(PetscComplex,1,0) 441 DEF_ComplexType(PetscComplex,2,0) 442 DEF_ComplexType(PetscComplex,4,0) 443 DEF_ComplexType(PetscComplex,8,0) 444 #endif 445 446 #define PairType(Type1,Type2) Type1##_##Type2 447 typedef struct {int u; int i;} PairType(int,int); 448 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 449 DEF_PairType(PairType(int,int),1,1) 450 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 451 452 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 453 DEF_DumbType(char,1,1) 454 DEF_DumbType(char,2,1) 455 DEF_DumbType(char,4,1) 456 DEF_DumbType(char,1,0) 457 DEF_DumbType(char,2,0) 458 DEF_DumbType(char,4,0) 459 460 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 461 DEF_DumbType(DumbInt,1,1) 462 DEF_DumbType(DumbInt,2,1) 463 DEF_DumbType(DumbInt,4,1) 464 DEF_DumbType(DumbInt,8,1) 465 DEF_DumbType(DumbInt,1,0) 466 DEF_DumbType(DumbInt,2,0) 467 DEF_DumbType(DumbInt,4,0) 468 DEF_DumbType(DumbInt,8,0) 469 470 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 471 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 472 { 473 int ierr; 474 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 475 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 476 return MPI_SUCCESS; 477 } 478 #endif 479 480 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 481 { 482 PetscErrorCode ierr; 483 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 484 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 485 486 PetscFunctionBegin; 487 /* Destroy device-specific fields */ 488 if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);} 489 490 /* Destroy host related fields */ 491 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);} 492 if (!link->use_nvshmem) { 493 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 494 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);} 495 } 496 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 497 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 498 ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 499 ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 500 } 501 } 502 ierr = PetscFree(link);CHKERRQ(ierr); 503 PetscFunctionReturn(0); 504 } 505 506 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 507 { 508 PetscErrorCode ierr; 509 510 PetscFunctionBegin; 511 ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 512 #if defined(PETSC_HAVE_NVSHMEM) 513 { 514 PetscBool use_nvshmem; 515 ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr); 516 if (use_nvshmem) { 517 ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 518 PetscFunctionReturn(0); 519 } 520 } 521 #endif 522 ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 523 PetscFunctionReturn(0); 524 } 525 526 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 527 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 528 */ 529 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 530 { 531 PetscErrorCode ierr; 532 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 533 PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 534 const PetscInt *rootoffset,*leafoffset; 535 PetscMPIInt n; 536 MPI_Aint disp; 537 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 538 MPI_Datatype unit = link->unit; 539 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 540 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 541 542 PetscFunctionBegin; 543 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 544 if (sf->persistent) { 545 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 546 ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 547 if (direction == PETSCSF_LEAF2ROOT) { 548 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 549 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 550 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 551 ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 552 } 553 } else { /* PETSCSF_ROOT2LEAF */ 554 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 555 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 556 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 557 ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 558 } 559 } 560 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 561 } 562 563 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 564 ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 565 if (direction == PETSCSF_LEAF2ROOT) { 566 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 567 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 568 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 569 ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 570 } 571 } else { /* PETSCSF_ROOT2LEAF */ 572 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 573 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 574 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 575 ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 576 } 577 } 578 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 579 } 580 } 581 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 582 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 583 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 584 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 585 PetscFunctionReturn(0); 586 } 587 588 589 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 590 { 591 PetscErrorCode ierr; 592 PetscSFLink link,*p; 593 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 594 595 PetscFunctionBegin; 596 /* Look for types in cache */ 597 for (p=&bas->inuse; (link=*p); p=&link->next) { 598 PetscBool match; 599 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 600 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 601 switch (cmode) { 602 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 603 case PETSC_USE_POINTER: break; 604 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 605 } 606 *mylink = link; 607 PetscFunctionReturn(0); 608 } 609 } 610 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 611 } 612 613 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 614 { 615 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 616 PetscSFLink link = *mylink; 617 618 PetscFunctionBegin; 619 link->rootdata = NULL; 620 link->leafdata = NULL; 621 link->next = bas->avail; 622 bas->avail = link; 623 *mylink = NULL; 624 PetscFunctionReturn(0); 625 } 626 627 /* Error out on unsupported overlapped communications */ 628 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 629 { 630 PetscErrorCode ierr; 631 PetscSFLink link,*p; 632 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 633 PetscBool match; 634 635 PetscFunctionBegin; 636 if (PetscDefined(USE_DEBUG)) { 637 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 638 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 639 */ 640 if (rootdata || leafdata) { 641 for (p=&bas->inuse; (link=*p); p=&link->next) { 642 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 643 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 644 } 645 } 646 } 647 PetscFunctionReturn(0); 648 } 649 650 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 651 { 652 PetscFunctionBegin; 653 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 654 PetscFunctionReturn(0); 655 } 656 657 658 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 659 { 660 PetscErrorCode ierr; 661 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 662 PetscBool is2Int,is2PetscInt; 663 PetscMPIInt ni,na,nd,combiner; 664 #if defined(PETSC_HAVE_COMPLEX) 665 PetscInt nPetscComplex=0; 666 #endif 667 668 PetscFunctionBegin; 669 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 670 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 671 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 672 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 673 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 674 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 675 #if defined(PETSC_HAVE_COMPLEX) 676 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 677 #endif 678 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 679 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 680 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 681 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 682 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 683 link->bs = 1; /* default */ 684 685 if (is2Int) { 686 PackInit_PairType_int_int_1_1(link); 687 link->bs = 1; 688 link->unitbytes = 2*sizeof(int); 689 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 690 link->basicunit = MPI_2INT; 691 link->unit = MPI_2INT; 692 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 693 PackInit_PairType_PetscInt_PetscInt_1_1(link); 694 link->bs = 1; 695 link->unitbytes = 2*sizeof(PetscInt); 696 link->basicunit = MPIU_2INT; 697 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 698 link->unit = MPIU_2INT; 699 } else if (nPetscReal) { 700 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 701 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 702 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 703 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 704 link->bs = nPetscReal; 705 link->unitbytes = nPetscReal*sizeof(PetscReal); 706 link->basicunit = MPIU_REAL; 707 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 708 } else if (nPetscInt) { 709 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 710 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 711 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 712 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 713 link->bs = nPetscInt; 714 link->unitbytes = nPetscInt*sizeof(PetscInt); 715 link->basicunit = MPIU_INT; 716 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 717 #if defined(PETSC_USE_64BIT_INDICES) 718 } else if (nInt) { 719 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 720 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 721 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 722 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 723 link->bs = nInt; 724 link->unitbytes = nInt*sizeof(int); 725 link->basicunit = MPI_INT; 726 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 727 #endif 728 } else if (nSignedChar) { 729 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 730 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 731 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 732 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 733 link->bs = nSignedChar; 734 link->unitbytes = nSignedChar*sizeof(SignedChar); 735 link->basicunit = MPI_SIGNED_CHAR; 736 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 737 } else if (nUnsignedChar) { 738 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 739 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 740 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 741 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 742 link->bs = nUnsignedChar; 743 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 744 link->basicunit = MPI_UNSIGNED_CHAR; 745 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 746 #if defined(PETSC_HAVE_COMPLEX) 747 } else if (nPetscComplex) { 748 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 749 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 750 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 751 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 752 link->bs = nPetscComplex; 753 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 754 link->basicunit = MPIU_COMPLEX; 755 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 756 #endif 757 } else { 758 MPI_Aint lb,nbyte; 759 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 760 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 761 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 762 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 763 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 764 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 765 link->bs = nbyte; 766 link->unitbytes = nbyte; 767 link->basicunit = MPI_BYTE; 768 } else { 769 nInt = nbyte / sizeof(int); 770 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 771 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 772 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 773 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 774 link->bs = nInt; 775 link->unitbytes = nbyte; 776 link->basicunit = MPI_INT; 777 } 778 if (link->isbuiltin) link->unit = unit; 779 } 780 781 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 782 783 link->Memcpy = PetscSFLinkMemcpy_Host; 784 PetscFunctionReturn(0); 785 } 786 787 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 788 { 789 PetscFunctionBegin; 790 *UnpackAndOp = NULL; 791 if (PetscMemTypeHost(mtype)) { 792 if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 793 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 794 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 795 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 796 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 797 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 798 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 799 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 800 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 801 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 802 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 803 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 804 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 805 } 806 #if defined(PETSC_HAVE_DEVICE) 807 else if (PetscMemTypeDevice(mtype) && !atomic) { 808 if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 809 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 810 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 811 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 812 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 813 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 814 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 815 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 816 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 817 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 818 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 819 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 820 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 821 } else if (PetscMemTypeDevice(mtype) && atomic) { 822 if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 823 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 824 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 825 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 826 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 827 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 828 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 829 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 830 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 831 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 832 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 833 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 834 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 835 } 836 #endif 837 PetscFunctionReturn(0); 838 } 839 840 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 841 { 842 PetscFunctionBegin; 843 *ScatterAndOp = NULL; 844 if (PetscMemTypeHost(mtype)) { 845 if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 846 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 847 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 848 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 849 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 850 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 851 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 852 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 853 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 854 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 855 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 856 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 857 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 858 } 859 #if defined(PETSC_HAVE_DEVICE) 860 else if (PetscMemTypeDevice(mtype) && !atomic) { 861 if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 862 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 863 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 864 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 865 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 866 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 867 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 868 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 869 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 870 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 871 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 872 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 873 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 874 } else if (PetscMemTypeDevice(mtype) && atomic) { 875 if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 876 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 877 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 878 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 879 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 880 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 881 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 882 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 883 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 884 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 885 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 886 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 887 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 888 } 889 #endif 890 PetscFunctionReturn(0); 891 } 892 893 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 894 { 895 PetscFunctionBegin; 896 *FetchAndOp = NULL; 897 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 898 if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 899 #if defined(PETSC_HAVE_DEVICE) 900 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 901 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 902 #endif 903 PetscFunctionReturn(0); 904 } 905 906 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 907 { 908 PetscFunctionBegin; 909 *FetchAndOpLocal = NULL; 910 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 911 if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 912 #if defined(PETSC_HAVE_DEVICE) 913 else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 914 else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 915 #endif 916 PetscFunctionReturn(0); 917 } 918 919 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 920 { 921 PetscErrorCode ierr; 922 PetscLogDouble flops; 923 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 924 925 PetscFunctionBegin; 926 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 927 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 928 #if defined(PETSC_HAVE_DEVICE) 929 if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 930 #endif 931 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 932 } 933 PetscFunctionReturn(0); 934 } 935 936 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 937 { 938 PetscLogDouble flops; 939 PetscErrorCode ierr; 940 941 PetscFunctionBegin; 942 if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 943 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 944 #if defined(PETSC_HAVE_DEVICE) 945 if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 946 #endif 947 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 948 } 949 PetscFunctionReturn(0); 950 } 951 952 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 953 Input Arguments: 954 +sf - The StarForest 955 .link - The link 956 .count - Number of entries to unpack 957 .start - The first index, significent when indices=NULL 958 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 959 .buf - A contiguous buffer to unpack from 960 -op - Operation after unpack 961 962 Output Arguments: 963 .data - The data to unpack to 964 */ 965 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 966 { 967 PetscFunctionBegin; 968 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 969 { 970 PetscErrorCode ierr; 971 PetscInt i; 972 PetscMPIInt n; 973 if (indices) { 974 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 975 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 976 */ 977 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 978 } else { 979 ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 980 ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr); 981 } 982 } 983 PetscFunctionReturn(0); 984 #else 985 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 986 #endif 987 } 988 989 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 990 { 991 PetscFunctionBegin; 992 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 993 { 994 PetscErrorCode ierr; 995 PetscInt i,disp; 996 if (!srcIdx) { 997 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 998 } else { 999 for (i=0; i<count; i++) { 1000 disp = dstIdx? dstIdx[i] : dstStart + i; 1001 ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 1002 } 1003 } 1004 } 1005 PetscFunctionReturn(0); 1006 #else 1007 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1008 #endif 1009 } 1010 1011 /*============================================================================= 1012 Pack/Unpack/Fetch/Scatter routines 1013 ============================================================================*/ 1014 1015 /* Pack rootdata to rootbuf 1016 Input Arguments: 1017 + sf - The SF this packing works on. 1018 . link - It gives the memtype of the roots and also provides root buffer. 1019 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1020 - rootdata - Where to read the roots. 1021 1022 Notes: 1023 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1024 in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 1025 */ 1026 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1027 { 1028 PetscErrorCode ierr; 1029 const PetscInt *rootindices = NULL; 1030 PetscInt count,start; 1031 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1032 PetscMemType rootmtype = link->rootmtype; 1033 PetscSFPackOpt opt = NULL; 1034 1035 PetscFunctionBegin; 1036 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1037 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1038 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1039 ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1040 ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1041 } 1042 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1043 PetscFunctionReturn(0); 1044 } 1045 1046 /* Pack leafdata to leafbuf */ 1047 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1048 { 1049 PetscErrorCode ierr; 1050 const PetscInt *leafindices = NULL; 1051 PetscInt count,start; 1052 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1053 PetscMemType leafmtype = link->leafmtype; 1054 PetscSFPackOpt opt = NULL; 1055 1056 PetscFunctionBegin; 1057 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1058 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1059 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1060 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1061 ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1062 } 1063 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1064 PetscFunctionReturn(0); 1065 } 1066 1067 /* Pack rootdata to rootbuf, which are in the same memory space */ 1068 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1069 { 1070 PetscErrorCode ierr; 1071 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1072 1073 PetscFunctionBegin; 1074 if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1075 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1076 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1077 } 1078 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1079 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);} 1080 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1081 PetscFunctionReturn(0); 1082 } 1083 /* Pack leafdata to leafbuf, which are in the same memory space */ 1084 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1085 { 1086 PetscErrorCode ierr; 1087 1088 PetscFunctionBegin; 1089 if (scope == PETSCSF_REMOTE) { 1090 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1091 if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1092 } 1093 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1094 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);} 1095 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1096 PetscFunctionReturn(0); 1097 } 1098 1099 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1100 { 1101 PetscErrorCode ierr; 1102 const PetscInt *rootindices = NULL; 1103 PetscInt count,start; 1104 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1105 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1106 PetscMemType rootmtype = link->rootmtype; 1107 PetscSFPackOpt opt = NULL; 1108 1109 PetscFunctionBegin; 1110 if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1111 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1112 if (UnpackAndOp) { 1113 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1114 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1115 } else { 1116 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1117 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1118 } 1119 } 1120 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1121 PetscFunctionReturn(0); 1122 } 1123 1124 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1125 { 1126 PetscErrorCode ierr; 1127 const PetscInt *leafindices = NULL; 1128 PetscInt count,start; 1129 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1130 PetscMemType leafmtype = link->leafmtype; 1131 PetscSFPackOpt opt = NULL; 1132 1133 PetscFunctionBegin; 1134 if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1135 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1136 if (UnpackAndOp) { 1137 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1138 ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1139 } else { 1140 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1141 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1142 } 1143 } 1144 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1145 PetscFunctionReturn(0); 1146 } 1147 /* Unpack rootbuf to rootdata, which are in the same memory space */ 1148 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1149 { 1150 PetscErrorCode ierr; 1151 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1152 1153 PetscFunctionBegin; 1154 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1155 if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);} 1156 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1157 if (scope == PETSCSF_REMOTE) { 1158 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1159 if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1160 } 1161 PetscFunctionReturn(0); 1162 } 1163 1164 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1165 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1166 { 1167 PetscErrorCode ierr; 1168 1169 PetscFunctionBegin; 1170 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1171 if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);} 1172 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1173 if (scope == PETSCSF_REMOTE) { 1174 if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1175 if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1176 } 1177 PetscFunctionReturn(0); 1178 } 1179 1180 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1181 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1182 { 1183 PetscErrorCode ierr; 1184 const PetscInt *rootindices = NULL; 1185 PetscInt count,start; 1186 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1187 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1188 PetscMemType rootmtype = link->rootmtype; 1189 PetscSFPackOpt opt = NULL; 1190 1191 PetscFunctionBegin; 1192 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1193 if (bas->rootbuflen[PETSCSF_REMOTE]) { 1194 /* Do FetchAndOp on rootdata with rootbuf */ 1195 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 1196 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1197 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1198 } 1199 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1200 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1201 PetscFunctionReturn(0); 1202 } 1203 1204 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1205 { 1206 PetscErrorCode ierr; 1207 const PetscInt *rootindices = NULL,*leafindices = NULL; 1208 PetscInt count,rootstart,leafstart; 1209 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1210 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1211 PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1212 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1213 PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1214 char *srcbuf = NULL,*dstbuf = NULL; 1215 PetscBool dstdups; 1216 1217 if (!buflen) PetscFunctionReturn(0); 1218 if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1219 if (direction == PETSCSF_ROOT2LEAF) { 1220 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1221 srcmtype = rootmtype; 1222 srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1223 dstmtype = leafmtype; 1224 dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1225 } else { 1226 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1227 srcmtype = leafmtype; 1228 srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1229 dstmtype = rootmtype; 1230 dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1231 } 1232 ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 1233 /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1234 if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 1235 if (direction == PETSCSF_ROOT2LEAF) { 1236 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1237 } else { 1238 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1239 } 1240 } else { 1241 dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1242 dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1243 ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1244 if (ScatterAndOp) { 1245 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1246 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1247 if (direction == PETSCSF_ROOT2LEAF) { 1248 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1249 } else { 1250 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1251 } 1252 } else { 1253 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1254 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1255 if (direction == PETSCSF_ROOT2LEAF) { 1256 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1257 } else { 1258 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1259 } 1260 } 1261 } 1262 PetscFunctionReturn(0); 1263 } 1264 1265 /* Fetch rootdata to leafdata and leafupdate locally */ 1266 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1267 { 1268 PetscErrorCode ierr; 1269 const PetscInt *rootindices = NULL,*leafindices = NULL; 1270 PetscInt count,rootstart,leafstart; 1271 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1272 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1273 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1274 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1275 1276 PetscFunctionBegin; 1277 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1278 if (rootmtype != leafmtype) { 1279 /* The local communication has to go through pack and unpack */ 1280 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1281 } else { 1282 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1283 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1284 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1285 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1286 } 1287 PetscFunctionReturn(0); 1288 } 1289 1290 /* 1291 Create per-rank pack/unpack optimizations based on indice patterns 1292 1293 Input Parameters: 1294 + n - Number of destination ranks 1295 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1296 - idx - [*] Array storing indices 1297 1298 Output Parameters: 1299 + opt - Pack optimizations. NULL if no optimizations. 1300 */ 1301 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1302 { 1303 PetscErrorCode ierr; 1304 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1305 PetscBool optimizable = PETSC_TRUE; 1306 PetscSFPackOpt opt; 1307 1308 PetscFunctionBegin; 1309 ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1310 ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1311 opt->n = opt->array[0] = n; 1312 opt->offset = opt->array + 1; 1313 opt->start = opt->array + n + 2; 1314 opt->dx = opt->array + 2*n + 2; 1315 opt->dy = opt->array + 3*n + 2; 1316 opt->dz = opt->array + 4*n + 2; 1317 opt->X = opt->array + 5*n + 2; 1318 opt->Y = opt->array + 6*n + 2; 1319 1320 for (r=0; r<n; r++) { /* For each destination rank */ 1321 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1322 p = offset[r]; 1323 start = idx[p]; /* First index for this rank */ 1324 p++; 1325 1326 /* Search in X dimension */ 1327 for (dx=1; dx<m; dx++,p++) { 1328 if (start+dx != idx[p]) break; 1329 } 1330 1331 dydz = m/dx; 1332 X = dydz > 1 ? (idx[p]-start) : dx; 1333 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1334 if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1335 for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1336 for (i=0; i<dx; i++,p++) { 1337 if (start+X*dy+i != idx[p]) { 1338 if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1339 else goto Z_dimension; 1340 } 1341 } 1342 } 1343 1344 Z_dimension: 1345 dz = m/(dx*dy); 1346 Y = dz > 1 ? (idx[p]-start)/X : dy; 1347 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1348 if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1349 for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1350 for (j=0; j<dy; j++) { 1351 for (i=0; i<dx; i++,p++) { 1352 if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 1353 } 1354 } 1355 } 1356 opt->start[r] = start; 1357 opt->dx[r] = dx; 1358 opt->dy[r] = dy; 1359 opt->dz[r] = dz; 1360 opt->X[r] = X; 1361 opt->Y[r] = Y; 1362 } 1363 1364 finish: 1365 /* If not optimizable, free arrays to save memory */ 1366 if (!n || !optimizable) { 1367 ierr = PetscFree(opt->array);CHKERRQ(ierr); 1368 ierr = PetscFree(opt);CHKERRQ(ierr); 1369 *out = NULL; 1370 } else { 1371 opt->offset[0] = 0; 1372 for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1373 *out = opt; 1374 } 1375 PetscFunctionReturn(0); 1376 } 1377 1378 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 1379 { 1380 PetscErrorCode ierr; 1381 PetscSFPackOpt opt = *out; 1382 1383 PetscFunctionBegin; 1384 if (opt) { 1385 ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr); 1386 ierr = PetscFree(opt);CHKERRQ(ierr); 1387 *out = NULL; 1388 } 1389 PetscFunctionReturn(0); 1390 } 1391 1392 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1393 { 1394 PetscErrorCode ierr; 1395 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1396 PetscInt i,j; 1397 1398 PetscFunctionBegin; 1399 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1400 for (i=0; i<2; i++) { /* Set defaults */ 1401 sf->leafstart[i] = 0; 1402 sf->leafcontig[i] = PETSC_TRUE; 1403 sf->leafdups[i] = PETSC_FALSE; 1404 bas->rootstart[i] = 0; 1405 bas->rootcontig[i] = PETSC_TRUE; 1406 bas->rootdups[i] = PETSC_FALSE; 1407 } 1408 1409 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1410 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1411 1412 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1413 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1414 1415 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1416 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1417 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1418 } 1419 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1420 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1421 } 1422 1423 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1424 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1425 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1426 1427 /* Are root indices for self and remote contiguous? */ 1428 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1429 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1430 1431 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1432 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1433 1434 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1435 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1436 } 1437 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1438 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1439 } 1440 1441 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1442 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1443 1444 #if defined(PETSC_HAVE_DEVICE) 1445 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1446 if (PetscDefined(HAVE_DEVICE)) { 1447 PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1448 if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1449 if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1450 if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1451 if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1452 } 1453 #endif 1454 PetscFunctionReturn(0); 1455 } 1456 1457 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1458 { 1459 PetscErrorCode ierr; 1460 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1461 PetscInt i; 1462 1463 PetscFunctionBegin; 1464 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1465 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1466 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1467 #if defined(PETSC_HAVE_DEVICE) 1468 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1469 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1470 #endif 1471 } 1472 PetscFunctionReturn(0); 1473 } 1474