1 #include <../src/vec/is/sf/impls/basic/sfpack.h> 2 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 3 4 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 5 6 #if defined(PETSC_HAVE_CUDA) 7 #include <cuda_runtime.h> 8 #include <petsccublas.h> 9 #endif 10 /* 11 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 12 * therefore we pack data types manually. This file defines packing routines for the standard data types. 13 */ 14 15 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 16 17 /* Operations working like s += t */ 18 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 19 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 20 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 21 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 22 /* Ref MPI MAXLOC */ 23 #define OP_XLOC(op,s,t) \ 24 do { \ 25 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 26 else if (!((s).u op (t).u)) s = t; \ 27 } while (0) 28 29 /* DEF_PackFunc - macro defining a Pack routine 30 31 Arguments of the macro: 32 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 33 .BS Block size for vectorization. It is a factor of bsz. 34 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 35 36 Arguments of the Pack routine: 37 +count Number of indices in idx[]. 38 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 39 .opt Per-pack optimization plan. NULL means no such plan. 40 .idx Indices of entries to packed. 41 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. 
if unit is MPI_2INT, then bs=2 and the basic type is int. 42 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 43 -packed Address of the packed data. 44 */ 45 #define DEF_PackFunc(Type,BS,EQ) \ 46 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 47 { \ 48 PetscErrorCode ierr; \ 49 const Type *u = (const Type*)unpacked,*u2; \ 50 Type *p = (Type*)packed,*p2; \ 51 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 52 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 53 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 54 PetscFunctionBegin; \ 55 if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 56 else if (opt) { /* has optimizations available */ \ 57 p2 = p; \ 58 for (r=0; r<opt->n; r++) { \ 59 u2 = u + opt->start[r]*MBS; \ 60 X = opt->X[r]; \ 61 Y = opt->Y[r]; \ 62 for (k=0; k<opt->dz[r]; k++) \ 63 for (j=0; j<opt->dy[r]; j++) { \ 64 ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 65 p2 += opt->dx[r]*MBS; \ 66 } \ 67 } \ 68 } else { \ 69 for (i=0; i<count; i++) \ 70 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 71 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 72 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 73 } \ 74 PetscFunctionReturn(0); \ 75 } 76 77 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 78 and inserts into a sparse array. 79 80 Arguments: 81 .Type Type of the data 82 .BS Block size for vectorization 83 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 84 85 Notes: 86 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 
87 */ 88 #define DEF_UnpackFunc(Type,BS,EQ) \ 89 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 90 { \ 91 PetscErrorCode ierr; \ 92 Type *u = (Type*)unpacked,*u2; \ 93 const Type *p = (const Type*)packed; \ 94 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 95 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 96 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 97 PetscFunctionBegin; \ 98 if (!idx) { \ 99 u += start*MBS; \ 100 if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 101 } else if (opt) { /* has optimizations available */ \ 102 for (r=0; r<opt->n; r++) { \ 103 u2 = u + opt->start[r]*MBS; \ 104 X = opt->X[r]; \ 105 Y = opt->Y[r]; \ 106 for (k=0; k<opt->dz[r]; k++) \ 107 for (j=0; j<opt->dy[r]; j++) { \ 108 ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 109 p += opt->dx[r]*MBS; \ 110 } \ 111 } \ 112 } else { \ 113 for (i=0; i<count; i++) \ 114 for (j=0; j<M; j++) \ 115 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 116 } \ 117 PetscFunctionReturn(0); \ 118 } 119 120 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 121 122 Arguments: 123 +Opname Name of the Op, such as Add, Mult, LAND, etc. 124 .Type Type of the data 125 .BS Block size for vectorization 126 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 127 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 128 .OpApply Macro defining application of the op. 
Could be OP_BINARY, OP_FUNCTION, OP_LXOR 129 */ 130 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 131 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 132 { \ 133 Type *u = (Type*)unpacked,*u2; \ 134 const Type *p = (const Type*)packed; \ 135 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 136 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 137 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 138 PetscFunctionBegin; \ 139 if (!idx) { \ 140 u += start*MBS; \ 141 for (i=0; i<count; i++) \ 142 for (j=0; j<M; j++) \ 143 for (k=0; k<BS; k++) \ 144 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 145 } else if (opt) { /* idx[] has patterns */ \ 146 for (r=0; r<opt->n; r++) { \ 147 u2 = u + opt->start[r]*MBS; \ 148 X = opt->X[r]; \ 149 Y = opt->Y[r]; \ 150 for (k=0; k<opt->dz[r]; k++) \ 151 for (j=0; j<opt->dy[r]; j++) { \ 152 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 153 p += opt->dx[r]*MBS; \ 154 } \ 155 } \ 156 } else { \ 157 for (i=0; i<count; i++) \ 158 for (j=0; j<M; j++) \ 159 for (k=0; k<BS; k++) \ 160 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 161 } \ 162 PetscFunctionReturn(0); \ 163 } 164 165 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 166 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 167 { \ 168 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 169 PetscInt i,j,k,r,l,bs=link->bs; \ 170 const PetscInt M = (EQ) ? 1 : bs/BS; \ 171 const PetscInt MBS = M*BS; \ 172 PetscFunctionBegin; \ 173 for (i=0; i<count; i++) { \ 174 r = (!idx ? 
start+i : idx[i])*MBS; \ 175 l = i*MBS; \ 176 for (j=0; j<M; j++) \ 177 for (k=0; k<BS; k++) { \ 178 tmp = u[r+j*BS+k]; \ 179 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 180 p[l+j*BS+k] = tmp; \ 181 } \ 182 } \ 183 PetscFunctionReturn(0); \ 184 } 185 186 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 187 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 188 { \ 189 PetscErrorCode ierr; \ 190 const Type *u = (const Type*)src; \ 191 Type *v = (Type*)dst; \ 192 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 193 const PetscInt M = (EQ) ? 1 : bs/BS; \ 194 const PetscInt MBS = M*BS; \ 195 PetscFunctionBegin; \ 196 if (!srcIdx) { /* src is contiguous */ \ 197 u += srcStart*MBS; \ 198 ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 199 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 200 u += srcOpt->start[0]*MBS; \ 201 v += dstStart*MBS; \ 202 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 203 for (k=0; k<srcOpt->dz[0]; k++) \ 204 for (j=0; j<srcOpt->dy[0]; j++) { \ 205 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 206 v += srcOpt->dx[0]*MBS; \ 207 } \ 208 } else { /* all other cases */ \ 209 for (i=0; i<count; i++) { \ 210 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 211 t = (!dstIdx ? 
dstStart+i : dstIdx[i])*MBS; \ 212 for (j=0; j<M; j++) \ 213 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 214 } \ 215 } \ 216 PetscFunctionReturn(0); \ 217 } 218 219 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 220 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 221 { \ 222 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 223 const Type *ldata = (const Type*)leafdata; \ 224 PetscInt i,j,k,r,l,bs = link->bs; \ 225 const PetscInt M = (EQ) ? 1 : bs/BS; \ 226 const PetscInt MBS = M*BS; \ 227 PetscFunctionBegin; \ 228 for (i=0; i<count; i++) { \ 229 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 230 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 231 for (j=0; j<M; j++) \ 232 for (k=0; k<BS; k++) { \ 233 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 234 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 235 } \ 236 } \ 237 PetscFunctionReturn(0); \ 238 } 239 240 /* Pack, Unpack/Fetch ops */ 241 #define DEF_Pack(Type,BS,EQ) \ 242 DEF_PackFunc(Type,BS,EQ) \ 243 DEF_UnpackFunc(Type,BS,EQ) \ 244 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 245 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 246 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 247 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 248 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 249 } 250 251 /* Add, Mult ops */ 252 #define DEF_Add(Type,BS,EQ) \ 253 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 254 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 255 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 256 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 257 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 258 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 259 static void 
CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 260 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 261 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 262 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 263 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 264 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 265 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 266 } 267 268 /* Max, Min ops */ 269 #define DEF_Cmp(Type,BS,EQ) \ 270 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 271 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 272 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 273 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 274 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 275 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 276 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 277 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 278 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 279 } 280 281 /* Logical ops. 282 The operator in OP_LXOR should be empty but is ||. It is not used. 
Put here to avoid 283 the compilation warning "empty macro arguments are undefined in ISO C90" 284 */ 285 #define DEF_Log(Type,BS,EQ) \ 286 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 287 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 288 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 289 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 290 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 291 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 292 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 293 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 294 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 295 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 296 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 297 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 298 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 299 } 300 301 /* Bitwise ops */ 302 #define DEF_Bit(Type,BS,EQ) \ 303 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 304 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 305 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 306 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 307 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 308 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 309 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 310 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 311 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 312 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 313 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 314 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 315 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 316 } 317 318 /* Maxloc, Minloc ops */ 319 #define DEF_Xloc(Type,BS,EQ) \ 320 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 321 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 322 
DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 323 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 324 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 325 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 326 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 327 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 328 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 329 } 330 331 #define DEF_IntegerType(Type,BS,EQ) \ 332 DEF_Pack(Type,BS,EQ) \ 333 DEF_Add(Type,BS,EQ) \ 334 DEF_Cmp(Type,BS,EQ) \ 335 DEF_Log(Type,BS,EQ) \ 336 DEF_Bit(Type,BS,EQ) \ 337 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 338 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 339 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 340 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 341 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 342 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 343 } 344 345 #define DEF_RealType(Type,BS,EQ) \ 346 DEF_Pack(Type,BS,EQ) \ 347 DEF_Add(Type,BS,EQ) \ 348 DEF_Cmp(Type,BS,EQ) \ 349 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 350 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 351 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 352 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 353 } 354 355 #if defined(PETSC_HAVE_COMPLEX) 356 #define DEF_ComplexType(Type,BS,EQ) \ 357 DEF_Pack(Type,BS,EQ) \ 358 DEF_Add(Type,BS,EQ) \ 359 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 360 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 361 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 362 } 363 #endif 364 365 #define DEF_DumbType(Type,BS,EQ) \ 366 DEF_Pack(Type,BS,EQ) \ 367 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 368 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 369 } 370 371 /* Maxloc, Minloc */ 372 #define DEF_PairType(Type,BS,EQ) \ 373 DEF_Pack(Type,BS,EQ) \ 374 DEF_Xloc(Type,BS,EQ) \ 375 static void 
CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 376 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 377 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 378 } 379 380 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 381 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 382 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 383 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 384 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 385 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 386 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 387 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 388 389 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 390 DEF_IntegerType(int,1,1) 391 DEF_IntegerType(int,2,1) 392 DEF_IntegerType(int,4,1) 393 DEF_IntegerType(int,8,1) 394 DEF_IntegerType(int,1,0) 395 DEF_IntegerType(int,2,0) 396 DEF_IntegerType(int,4,0) 397 DEF_IntegerType(int,8,0) 398 #endif 399 400 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 401 typedef signed char SignedChar; 402 DEF_IntegerType(SignedChar,1,1) 403 DEF_IntegerType(SignedChar,2,1) 404 DEF_IntegerType(SignedChar,4,1) 405 DEF_IntegerType(SignedChar,8,1) 406 DEF_IntegerType(SignedChar,1,0) 407 DEF_IntegerType(SignedChar,2,0) 408 DEF_IntegerType(SignedChar,4,0) 409 DEF_IntegerType(SignedChar,8,0) 410 411 typedef unsigned char UnsignedChar; 412 DEF_IntegerType(UnsignedChar,1,1) 413 DEF_IntegerType(UnsignedChar,2,1) 414 DEF_IntegerType(UnsignedChar,4,1) 415 DEF_IntegerType(UnsignedChar,8,1) 416 DEF_IntegerType(UnsignedChar,1,0) 417 DEF_IntegerType(UnsignedChar,2,0) 418 DEF_IntegerType(UnsignedChar,4,0) 419 DEF_IntegerType(UnsignedChar,8,0) 420 421 DEF_RealType(PetscReal,1,1) 422 DEF_RealType(PetscReal,2,1) 423 DEF_RealType(PetscReal,4,1) 424 DEF_RealType(PetscReal,8,1) 425 
DEF_RealType(PetscReal,1,0) 426 DEF_RealType(PetscReal,2,0) 427 DEF_RealType(PetscReal,4,0) 428 DEF_RealType(PetscReal,8,0) 429 430 #if defined(PETSC_HAVE_COMPLEX) 431 DEF_ComplexType(PetscComplex,1,1) 432 DEF_ComplexType(PetscComplex,2,1) 433 DEF_ComplexType(PetscComplex,4,1) 434 DEF_ComplexType(PetscComplex,8,1) 435 DEF_ComplexType(PetscComplex,1,0) 436 DEF_ComplexType(PetscComplex,2,0) 437 DEF_ComplexType(PetscComplex,4,0) 438 DEF_ComplexType(PetscComplex,8,0) 439 #endif 440 441 #define PairType(Type1,Type2) Type1##_##Type2 442 typedef struct {int u; int i;} PairType(int,int); 443 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 444 DEF_PairType(PairType(int,int),1,1) 445 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 446 447 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 448 DEF_DumbType(char,1,1) 449 DEF_DumbType(char,2,1) 450 DEF_DumbType(char,4,1) 451 DEF_DumbType(char,1,0) 452 DEF_DumbType(char,2,0) 453 DEF_DumbType(char,4,0) 454 455 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 456 DEF_DumbType(DumbInt,1,1) 457 DEF_DumbType(DumbInt,2,1) 458 DEF_DumbType(DumbInt,4,1) 459 DEF_DumbType(DumbInt,8,1) 460 DEF_DumbType(DumbInt,1,0) 461 DEF_DumbType(DumbInt,2,0) 462 DEF_DumbType(DumbInt,4,0) 463 DEF_DumbType(DumbInt,8,0) 464 465 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 466 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 467 { 468 int ierr; 469 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 470 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 471 return MPI_SUCCESS; 472 } 473 #endif 474 475 /* 476 The routine Creates a communication link for the given operation. It first looks up its link cache. If 477 there is a free & suitable one, it uses it. Otherwise it creates a new one. 478 479 A link contains buffers and MPI requests for send/recv. 
It also contains pack/unpack routines to pack/unpack 480 root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 481 can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 482 those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 483 484 The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 485 486 In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 487 488 The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 489 need pack/unpack data. 490 */ 491 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 492 { 493 PetscErrorCode ierr; 494 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 495 PetscInt i,j,k,nrootreqs,nleafreqs,nreqs; 496 PetscSFLink *p,link; 497 PetscSFDirection direction; 498 MPI_Request *reqs = NULL; 499 PetscBool match,rootdirect[2],leafdirect[2]; 500 PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */ 501 PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/ 502 503 PetscFunctionBegin; 504 ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 505 506 /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 507 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 508 if (sfop == PETSCSF_BCAST) { 509 rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 510 leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 511 } else if (sfop == PETSCSF_REDUCE) { 512 leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 513 rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? 
PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 514 } else { /* PETSCSF_FETCH */ 515 rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 516 leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 517 } 518 } 519 520 if (sf->use_gpu_aware_mpi) { 521 rootmtype_mpi = rootmtype; 522 leafmtype_mpi = leafmtype; 523 } else { 524 rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 525 } 526 /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */ 527 rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0; 528 leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0; 529 530 direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 531 nrootreqs = bas->nrootreqs; 532 nleafreqs = sf->nleafreqs; 533 534 /* Look for free links in cache */ 535 for (p=&bas->avail; (link=*p); p=&link->next) { 536 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 537 if (match) { 538 /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current. 539 If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests(). 
540 */ 541 if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 542 reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 543 for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 544 link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 545 } 546 if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 547 reqs = link->leafreqs[direction][leafmtype][1]; 548 for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 549 link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 550 } 551 *p = link->next; /* Remove from available list */ 552 goto found; 553 } 554 } 555 556 ierr = PetscNew(&link);CHKERRQ(ierr); 557 ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr); 558 ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */ 559 560 nreqs = (nrootreqs+nleafreqs)*8; 561 ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr); 562 for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 563 564 for (i=0; i<2; i++) { /* Two communication directions */ 565 for (j=0; j<2; j++) { /* Two memory types */ 566 for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */ 567 link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k); 568 link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k); 569 } 570 } 571 } 572 573 found: 574 #if defined(PETSC_HAVE_DEVICE) 575 if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);} 576 #endif 577 578 /* Allocate buffers along root/leafdata */ 579 for 
(i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 580 /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 581 if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 582 if (bas->rootbuflen[i]) { 583 if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 584 link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes; 585 } else { /* Have to have a separate rootbuf */ 586 if (!link->rootbuf_alloc[i][rootmtype]) { 587 ierr = PetscSFMalloc(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr); 588 } 589 link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 590 } 591 } 592 593 if (sf->leafbuflen[i]) { 594 if (leafdirect[i]) { 595 link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes; 596 } else { 597 if (!link->leafbuf_alloc[i][leafmtype]) { 598 ierr = PetscSFMalloc(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr); 599 } 600 link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 601 } 602 } 603 } 604 605 #if defined(PETSC_HAVE_DEVICE) 606 /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 607 if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) { 608 if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 609 ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 610 } 611 link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 612 } 613 if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) { 614 if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 615 ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 616 } 617 
link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 618 } 619 #endif 620 621 /* Set `current` state of the link. They may change between different SF invocations with the same link */ 622 if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 623 if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 624 if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 625 } 626 627 link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 628 link->leafdata = leafdata; 629 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 630 link->rootdirect[i] = rootdirect[i]; 631 link->leafdirect[i] = leafdirect[i]; 632 } 633 link->rootdirect_mpi = rootdirect_mpi; 634 link->leafdirect_mpi = leafdirect_mpi; 635 link->rootmtype = rootmtype; 636 link->leafmtype = leafmtype; 637 link->rootmtype_mpi = rootmtype_mpi; 638 link->leafmtype_mpi = leafmtype_mpi; 639 640 link->next = bas->inuse; 641 bas->inuse = link; 642 *mylink = link; 643 PetscFunctionReturn(0); 644 } 645 646 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 647 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* Build one persistent request per remote root rank; the first ndrootranks ranks are distinguished (on-process) and skipped */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Roots receive in leaf-to-root communication */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; /* byte offset of this rank's chunk in the remote root buffer */
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        /* Roots send in root-to-leaf communication */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Mirror image for the remote leaf ranks: leaves send in LEAF2ROOT and receive in ROOT2LEAF */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand back the requested buffers/requests; each output pointer may be NULL if the caller does not want it */
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}

/* Find the link in the 'inuse' list matching (unit,rootdata,leafdata). With PETSC_OWN_POINTER the link is
   unchained from the list; with PETSC_USE_POINTER it stays chained. Errors if no match is found. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
  PetscFunctionReturn(0);
}

/* Return a link to the head of the 'avail' free list after clearing its data pointers, so it can be reused */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* factor 8 presumably = 2 directions x 2 memtypes x 2 direct/indirect -- TODO confirm against link->reqs allocation */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed.
*/
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    /* Free both LOCAL and REMOTE pack buffers, on host and (if enabled) device */
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
#endif
    }
#if defined(PETSC_HAVE_DEVICE)
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
#elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
#endif
#endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  /* This check only runs in debug builds */
  if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlapping since this process does not participate in communication. Overlapping is harmless.
  */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) {
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}

/* Decompose 'unit' into a count of a basic MPI type and install the matching host pack/unpack kernels on the
   link; also sets link->bs, link->unitbytes, link->basicunit, link->isbuiltin and link->unit */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each *_contig test yields how many of that basic type 'unit' is a contiguous run of (0 if it is not) */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ?
PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* Pick the kernel with the largest vector block size (8,4,2,1) dividing the count; the _1 suffix means bs==BS exactly */
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only needed when PetscInt != int; otherwise nPetscInt already covered it */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Fall back to a "dumb" type: copy only, no reduction support. Use int-sized chunks when possible, else bytes */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit; /* builtin (per the envelope test above): keep unit as-is; otherwise it is dup'ed below */
  }

  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

/* Select the UnpackAndOp kernel on the link matching memory type 'mtype' and reduction 'op';
   *UnpackAndOp is set to NULL when there is no specialized kernel (caller falls back to MPI_Reduce_local) */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  /* Device kernels come in two flavors: plain (d_) and atomic (da_) for when destination entries may be duplicated */
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Select the ScatterAndOp kernel (sparse source -> sparse destination) matching 'mtype' and 'op';
   *ScatterAndOp is NULL when no specialized kernel exists (caller falls back to MPI_Reduce_local) */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Select the FetchAndOp kernel; only MPI_SUM/MPIU_SUM is supported */
PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode
(**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

/* Select the local (root-to-leaf in shared memory) FetchAndOp kernel; only MPI_SUM/MPIU_SUM is supported */
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/* Log flops performed while unpacking root data; only counts when op is a real reduction on PetscScalars */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* Log flops performed while unpacking leaf data; only counts when op is a real reduction on PetscScalars */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
   Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

   Output Arguments:
  .data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination: one call reduces all count units starting at <start> */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/* Scatter fallback via MPI_Reduce_local: reduce src entries (indexed by srcIdx or contiguous from srcStart)
   into dst entries (indexed by dstIdx or contiguous from dstStart) */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      /* Contiguous source behaves like an unpack from a buffer starting at srcStart */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 =============================================================================*/

/* Pack rootdata to rootbuf
   Input Arguments:
  + sf    - The SF this packing works on.
  . link  - It gives the memtype of the roots and also provides root buffer.
  . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

   Notes:
     When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
     in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* If MPI is not GPU-aware, stage the packed buffer on the host before communication */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No specialized kernel for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr =
PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No specialized kernel for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* The fetched old values in rootbuf must be sent back; stage them for non-GPU-aware MPI */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      /* Fused root->leaf scatter: source (root) args first, destination (leaf) args last */
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* No fused kernel for this op: use host indices and reduce entry-by-entry via MPI_Reduce_local */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Reduce leafdata to rootdata locally

   Mirror image of PetscSFLinkBcastAndOpLocal: combines the local (same-process) leaf entries of
   leafdata into rootdata with op. Note the ScatterAndOp argument order is reversed relative to
   the Bcast case — here leaves are the source and roots the destination — and the kernel is
   selected with bas->rootdups (destination duplicates determine whether atomics are needed).
*/
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local leaves: nothing to reduce */
  if (rootmtype != leafmtype) { /* Mixed memory types: go through the buffered path */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      /* Fused leaf->root scatter: leaf (source) args first, root (destination) args last */
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata to leafdata and leafupdate locally

   Local fetch-and-op: for same-process root/leaf pairs, leafupdate receives root values and
   rootdata is updated with leafdata under op (exact kernel semantics live in the routine
   returned by PetscSFLinkGetFetchAndOpLocal). Unlike Bcast/Reduce above, there is no
   pack/unpack fallback: mixed host/device root and leaf data is unsupported and errors out.
*/
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* Mixed host/device local fetch-and-op is not supported: no pack/unpack path is implemented for it */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    /* Kernel selected by root memtype/op/duplicate status; roots first, then leaves, then the leafupdate output */
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Create per-rank pack/unpack optimizations based on index patterns

  Input Parameters:
+ n      - Number of destination ranks
. offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
- idx    - [*] Array storing indices

  Output Parameters:
+ out    - Pack optimizations. NULL if no optimizations.

  Notes:
  For each rank, tries to recognize its index list as a dense dx x dy x dz sub-block of a logical
  3D array with leading dimensions X and Y, i.e., idx[.] = start + X*Y*k + X*j + i for i in [0,dx),
  j in [0,dy), k in [0,dz). When every rank matches such a pattern, packing can use strided memcpys
  (see DEF_PackFunc) instead of per-entry indexed gathers. If any rank fails to match, the whole
  plan is abandoned and *out is set to NULL.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  /* One flat allocation: [n] at array[0], offset[n+1], then start/dx/dy/dz/X/Y, each of length n */
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: dx = length of the initial run of consecutive indices */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* X = stride between consecutive x-rows; when only one row exists, X degenerates to dx */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension: count x-rows spaced X apart */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* Clean break at a row boundary: the y-run ended, try the z dimension */
        }
      }
    }

    Z_dimension:
    dz = m/(dx*dy);
    /* Y = number of x-rows per z-plane in the logical array; degenerates to dy for a single plane */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* offset[] is rebuilt rebased to 0: offset[r+1]-offset[r] = dx*dy*dz = entry count of rank r */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

/* Free a pack-optimization plan and set *out to NULL; no-op when *out is already NULL.
   opt->array is released with PetscSFFree(mtype,...) because the plan may have been mirrored to
   device memory (see the PETSC_MEMTYPE_DEVICE calls in PetscSFResetPackFields); the struct itself
   is always host memory. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

/* Analyze the SF's leaf and root index lists and fill in the pack-related fields:
   buffer lengths, start indices, contiguity flags, per-rank pack plans and duplicate flags,
   separately for the local ([0], same-process) and remote ([1]) parts. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* ndranks presumably counts the leading "distinguished" (same-process) ranks — confirm in sf header */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous?
   */
  /* Same analysis as above for the leaves, now on the root side (irootloc/ioffset, diranks = distinguished ranks) */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { /* self */
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { /* remote */
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  /* Non-contiguous roots: try the per-rank 3D-subarray pack plans */
  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}

/* Release the pack plans built by PetscSFSetUpPackFields: host plans for both the local and
   remote scopes, plus their device mirrors when device support is compiled in. Each pointer
   is reset to NULL by PetscSFDestroyPackOpt, so this is safe to call repeatedly. */
PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}