1 #include "petsc/private/sfimpl.h" 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 6 7 #if defined(PETSC_HAVE_CUDA) 8 #include <cuda_runtime.h> 9 #include <petsccublas.h> 10 #endif 11 /* 12 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 13 * therefore we pack data types manually. This file defines packing routines for the standard data types. 14 */ 15 16 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 17 18 /* Operations working like s += t */ 19 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 20 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 21 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 22 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 23 /* Ref MPI MAXLOC */ 24 #define OP_XLOC(op,s,t) \ 25 do { \ 26 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 27 else if (!((s).u op (t).u)) s = t; \ 28 } while (0) 29 30 /* DEF_PackFunc - macro defining a Pack routine 31 32 Arguments of the macro: 33 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 34 .BS Block size for vectorization. It is a factor of bsz. 35 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 36 37 Arguments of the Pack routine: 38 +count Number of indices in idx[]. 39 .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 40 .opt Per-pack optimization plan. NULL means no such plan. 41 .idx Indices of entries to packed. 42 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 43 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 44 -packed Address of the packed data. 45 */ 46 #define DEF_PackFunc(Type,BS,EQ) \ 47 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 48 { \ 49 PetscErrorCode ierr; \ 50 const Type *u = (const Type*)unpacked,*u2; \ 51 Type *p = (Type*)packed,*p2; \ 52 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 53 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 54 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 55 PetscFunctionBegin; \ 56 if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 57 else if (opt) { /* has optimizations available */ \ 58 p2 = p; \ 59 for (r=0; r<opt->n; r++) { \ 60 u2 = u + opt->start[r]*MBS; \ 61 X = opt->X[r]; \ 62 Y = opt->Y[r]; \ 63 for (k=0; k<opt->dz[r]; k++) \ 64 for (j=0; j<opt->dy[r]; j++) { \ 65 ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 66 p2 += opt->dx[r]*MBS; \ 67 } \ 68 } \ 69 } else { \ 70 for (i=0; i<count; i++) \ 71 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 72 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 73 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 74 } \ 75 PetscFunctionReturn(0); \ 76 } 77 78 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 79 and inserts into a sparse array. 80 81 Arguments: 82 .Type Type of the data 83 .BS Block size for vectorization 84 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 85 86 Notes: 87 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 88 */ 89 #define DEF_UnpackFunc(Type,BS,EQ) \ 90 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 91 { \ 92 PetscErrorCode ierr; \ 93 Type *u = (Type*)unpacked,*u2; \ 94 const Type *p = (const Type*)packed; \ 95 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 96 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 97 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 98 PetscFunctionBegin; \ 99 if (!idx) { \ 100 u += start*MBS; \ 101 if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 102 } else if (opt) { /* has optimizations available */ \ 103 for (r=0; r<opt->n; r++) { \ 104 u2 = u + opt->start[r]*MBS; \ 105 X = opt->X[r]; \ 106 Y = opt->Y[r]; \ 107 for (k=0; k<opt->dz[r]; k++) \ 108 for (j=0; j<opt->dy[r]; j++) { \ 109 ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 110 p += opt->dx[r]*MBS; \ 111 } \ 112 } \ 113 } else { \ 114 for (i=0; i<count; i++) \ 115 for (j=0; j<M; j++) \ 116 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 117 } \ 118 PetscFunctionReturn(0); \ 119 } 120 121 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 122 123 Arguments: 124 +Opname Name of the Op, such as Add, Mult, LAND, etc. 125 .Type Type of the data 126 .BS Block size for vectorization 127 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 128 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 129 .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 130 */ 131 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 132 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 133 { \ 134 Type *u = (Type*)unpacked,*u2; \ 135 const Type *p = (const Type*)packed; \ 136 PetscInt i,j,k,X,Y,r,bs=link->bs; \ 137 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 138 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 139 PetscFunctionBegin; \ 140 if (!idx) { \ 141 u += start*MBS; \ 142 for (i=0; i<count; i++) \ 143 for (j=0; j<M; j++) \ 144 for (k=0; k<BS; k++) \ 145 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 146 } else if (opt) { /* idx[] has patterns */ \ 147 for (r=0; r<opt->n; r++) { \ 148 u2 = u + opt->start[r]*MBS; \ 149 X = opt->X[r]; \ 150 Y = opt->Y[r]; \ 151 for (k=0; k<opt->dz[r]; k++) \ 152 for (j=0; j<opt->dy[r]; j++) { \ 153 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 154 p += opt->dx[r]*MBS; \ 155 } \ 156 } \ 157 } else { \ 158 for (i=0; i<count; i++) \ 159 for (j=0; j<M; j++) \ 160 for (k=0; k<BS; k++) \ 161 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 162 } \ 163 PetscFunctionReturn(0); \ 164 } 165 166 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 167 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 168 { \ 169 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 170 PetscInt i,j,k,r,l,bs=link->bs; \ 171 const PetscInt M = (EQ) ? 1 : bs/BS; \ 172 const PetscInt MBS = M*BS; \ 173 PetscFunctionBegin; \ 174 for (i=0; i<count; i++) { \ 175 r = (!idx ? start+i : idx[i])*MBS; \ 176 l = i*MBS; \ 177 for (j=0; j<M; j++) \ 178 for (k=0; k<BS; k++) { \ 179 tmp = u[r+j*BS+k]; \ 180 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 181 p[l+j*BS+k] = tmp; \ 182 } \ 183 } \ 184 PetscFunctionReturn(0); \ 185 } 186 187 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 188 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 189 { \ 190 PetscErrorCode ierr; \ 191 const Type *u = (const Type*)src; \ 192 Type *v = (Type*)dst; \ 193 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 194 const PetscInt M = (EQ) ? 1 : bs/BS; \ 195 const PetscInt MBS = M*BS; \ 196 PetscFunctionBegin; \ 197 if (!srcIdx) { /* src is contiguous */ \ 198 u += srcStart*MBS; \ 199 ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 200 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 201 u += srcOpt->start[0]*MBS; \ 202 v += dstStart*MBS; \ 203 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 204 for (k=0; k<srcOpt->dz[0]; k++) \ 205 for (j=0; j<srcOpt->dy[0]; j++) { \ 206 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 207 v += srcOpt->dx[0]*MBS; \ 208 } \ 209 } else { /* all other cases */ \ 210 for (i=0; i<count; i++) { \ 211 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 212 t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 213 for (j=0; j<M; j++) \ 214 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 215 } \ 216 } \ 217 PetscFunctionReturn(0); \ 218 } 219 220 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 221 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 222 { \ 223 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 224 const Type *ldata = (const Type*)leafdata; \ 225 PetscInt i,j,k,r,l,bs = link->bs; \ 226 const PetscInt M = (EQ) ? 1 : bs/BS; \ 227 const PetscInt MBS = M*BS; \ 228 PetscFunctionBegin; \ 229 for (i=0; i<count; i++) { \ 230 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 231 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 232 for (j=0; j<M; j++) \ 233 for (k=0; k<BS; k++) { \ 234 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 235 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 236 } \ 237 } \ 238 PetscFunctionReturn(0); \ 239 } 240 241 /* Pack, Unpack/Fetch ops */ 242 #define DEF_Pack(Type,BS,EQ) \ 243 DEF_PackFunc(Type,BS,EQ) \ 244 DEF_UnpackFunc(Type,BS,EQ) \ 245 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 246 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 247 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 248 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 249 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 250 } 251 252 /* Add, Mult ops */ 253 #define DEF_Add(Type,BS,EQ) \ 254 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 255 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 256 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 257 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 258 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 259 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 260 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 261 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 262 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 263 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 264 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 265 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 266 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 267 } 268 269 /* Max, Min ops */ 270 #define DEF_Cmp(Type,BS,EQ) \ 271 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 272 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 273 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 274 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 275 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 276 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 277 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 278 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 279 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 280 } 281 282 /* Logical ops. 283 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 284 the compilation warning "empty macro arguments are undefined in ISO C90" 285 */ 286 #define DEF_Log(Type,BS,EQ) \ 287 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 288 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 289 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 290 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 291 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 292 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 293 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 294 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 295 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 296 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 297 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 298 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 299 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 300 } 301 302 /* Bitwise ops */ 303 #define DEF_Bit(Type,BS,EQ) \ 304 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 305 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 306 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 307 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 308 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 309 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 310 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 311 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 312 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 313 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 314 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 315 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 316 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 317 } 318 319 /* Maxloc, Minloc ops */ 320 #define DEF_Xloc(Type,BS,EQ) \ 321 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 322 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 323 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 324 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 325 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 326 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 327 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 328 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 329 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 330 } 331 332 #define DEF_IntegerType(Type,BS,EQ) \ 333 DEF_Pack(Type,BS,EQ) \ 334 DEF_Add(Type,BS,EQ) \ 335 DEF_Cmp(Type,BS,EQ) \ 336 DEF_Log(Type,BS,EQ) \ 337 DEF_Bit(Type,BS,EQ) \ 338 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 339 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 340 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 341 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 342 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 343 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 344 } 345 346 #define DEF_RealType(Type,BS,EQ) \ 347 DEF_Pack(Type,BS,EQ) \ 348 DEF_Add(Type,BS,EQ) \ 349 DEF_Cmp(Type,BS,EQ) \ 350 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 351 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 352 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 353 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 354 } 355 356 #if defined(PETSC_HAVE_COMPLEX) 357 #define DEF_ComplexType(Type,BS,EQ) \ 358 DEF_Pack(Type,BS,EQ) \ 359 DEF_Add(Type,BS,EQ) \ 360 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 361 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 362 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 363 } 364 #endif 365 366 #define DEF_DumbType(Type,BS,EQ) \ 367 DEF_Pack(Type,BS,EQ) \ 368 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 369 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 370 } 371 372 /* Maxloc, Minloc */ 373 #define DEF_PairType(Type,BS,EQ) \ 374 DEF_Pack(Type,BS,EQ) \ 375 DEF_Xloc(Type,BS,EQ) \ 376 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 377 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 378 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 379 } 380 381 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 382 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 383 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 384 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 385 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 386 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 387 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 388 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 389 390 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 391 DEF_IntegerType(int,1,1) 392 DEF_IntegerType(int,2,1) 393 DEF_IntegerType(int,4,1) 394 DEF_IntegerType(int,8,1) 395 DEF_IntegerType(int,1,0) 396 DEF_IntegerType(int,2,0) 397 DEF_IntegerType(int,4,0) 398 DEF_IntegerType(int,8,0) 399 #endif 400 401 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 402 typedef signed char SignedChar; 403 DEF_IntegerType(SignedChar,1,1) 404 DEF_IntegerType(SignedChar,2,1) 405 DEF_IntegerType(SignedChar,4,1) 406 DEF_IntegerType(SignedChar,8,1) 407 DEF_IntegerType(SignedChar,1,0) 408 DEF_IntegerType(SignedChar,2,0) 409 DEF_IntegerType(SignedChar,4,0) 410 DEF_IntegerType(SignedChar,8,0) 411 412 typedef unsigned char UnsignedChar; 413 DEF_IntegerType(UnsignedChar,1,1) 414 DEF_IntegerType(UnsignedChar,2,1) 415 DEF_IntegerType(UnsignedChar,4,1) 416 DEF_IntegerType(UnsignedChar,8,1) 417 DEF_IntegerType(UnsignedChar,1,0) 418 DEF_IntegerType(UnsignedChar,2,0) 419 DEF_IntegerType(UnsignedChar,4,0) 420 DEF_IntegerType(UnsignedChar,8,0) 421 422 DEF_RealType(PetscReal,1,1) 423 DEF_RealType(PetscReal,2,1) 424 DEF_RealType(PetscReal,4,1) 425 DEF_RealType(PetscReal,8,1) 426 DEF_RealType(PetscReal,1,0) 427 DEF_RealType(PetscReal,2,0) 428 DEF_RealType(PetscReal,4,0) 429 DEF_RealType(PetscReal,8,0) 430 431 #if defined(PETSC_HAVE_COMPLEX) 432 DEF_ComplexType(PetscComplex,1,1) 433 DEF_ComplexType(PetscComplex,2,1) 434 DEF_ComplexType(PetscComplex,4,1) 435 DEF_ComplexType(PetscComplex,8,1) 436 DEF_ComplexType(PetscComplex,1,0) 437 DEF_ComplexType(PetscComplex,2,0) 438 DEF_ComplexType(PetscComplex,4,0) 439 DEF_ComplexType(PetscComplex,8,0) 440 #endif 441 442 #define PairType(Type1,Type2) Type1##_##Type2 443 typedef struct {int u; int i;} PairType(int,int); 444 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 445 DEF_PairType(PairType(int,int),1,1) 446 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 447 448 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 449 DEF_DumbType(char,1,1) 450 DEF_DumbType(char,2,1) 451 DEF_DumbType(char,4,1) 452 DEF_DumbType(char,1,0) 453 DEF_DumbType(char,2,0) 454 DEF_DumbType(char,4,0) 455 456 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 457 DEF_DumbType(DumbInt,1,1) 458 DEF_DumbType(DumbInt,2,1) 459 DEF_DumbType(DumbInt,4,1) 460 DEF_DumbType(DumbInt,8,1) 461 DEF_DumbType(DumbInt,1,0) 462 DEF_DumbType(DumbInt,2,0) 463 DEF_DumbType(DumbInt,4,0) 464 DEF_DumbType(DumbInt,8,0) 465 466 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 467 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 468 { 469 int ierr; 470 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 471 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 472 return MPI_SUCCESS; 473 } 474 #endif 475 476 /* 477 The routine Creates a communication link for the given operation. It first looks up its link cache. If 478 there is a free & suitable one, it uses it. Otherwise it creates a new one. 479 480 A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 481 root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 482 can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 483 those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 484 485 The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 486 487 In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 488 489 The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 490 need pack/unpack data. 491 */ 492 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 493 { 494 PetscErrorCode ierr; 495 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 496 PetscInt i,j,k,nrootreqs,nleafreqs,nreqs; 497 PetscSFLink *p,link; 498 PetscSFDirection direction; 499 MPI_Request *reqs = NULL; 500 PetscBool match,rootdirect[2],leafdirect[2]; 501 PetscMemType rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1*/ 502 PetscMemType leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; 503 PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */ 504 PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/ 505 506 PetscFunctionBegin; 507 ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 508 509 /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 510 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 511 if (sfop == PETSCSF_BCAST) { 512 rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 513 leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 514 } else if (sfop == PETSCSF_REDUCE) { 515 leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 516 rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 517 } else { /* PETSCSF_FETCH */ 518 rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 519 leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 520 } 521 } 522 523 if (sf->use_gpu_aware_mpi) { 524 rootmtype_mpi = rootmtype; 525 leafmtype_mpi = leafmtype; 526 } else { 527 rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 528 } 529 /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */ 530 rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0; 531 leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0; 532 533 direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 534 nrootreqs = bas->nrootreqs; 535 nleafreqs = sf->nleafreqs; 536 537 /* Look for free links in cache */ 538 for (p=&bas->avail; (link=*p); p=&link->next) { 539 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 540 if (match) { 541 /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current. 542 If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests(). 543 */ 544 if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 545 reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 546 for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 547 link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 548 } 549 if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 550 reqs = link->leafreqs[direction][leafmtype][1]; 551 for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 552 link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 553 } 554 *p = link->next; /* Remove from available list */ 555 goto found; 556 } 557 } 558 559 ierr = PetscNew(&link);CHKERRQ(ierr); 560 ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr); 561 ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */ 562 563 nreqs = (nrootreqs+nleafreqs)*8; 564 ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr); 565 for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 566 567 for (i=0; i<2; i++) { /* Two communication directions */ 568 for (j=0; j<2; j++) { /* Two memory types */ 569 for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */ 570 link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k); 571 link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k); 572 } 573 } 574 } 575 576 found: 577 578 #if defined(PETSC_HAVE_DEVICE) 579 if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) { 580 #if defined(PETSC_HAVE_CUDA) 581 if (sf->backend == PETSCSF_BACKEND_CUDA) {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);} 582 #endif 583 #if defined(PETSC_HAVE_KOKKOS) 584 if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);} 585 #endif 586 } 587 #endif 588 589 /* Allocate buffers along root/leafdata */ 590 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 591 /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 592 if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 593 if (bas->rootbuflen[i]) { 594 if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 595 link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes; 596 } else { /* Have to have a separate rootbuf */ 597 if (!link->rootbuf_alloc[i][rootmtype]) { 598 ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr); 599 } 600 link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 601 } 602 } 603 604 if (sf->leafbuflen[i]) { 605 if (leafdirect[i]) { 606 link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes; 607 } else { 608 if (!link->leafbuf_alloc[i][leafmtype]) { 609 ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr); 610 } 611 link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 612 } 613 } 614 } 615 616 #if defined(PETSC_HAVE_DEVICE) 617 /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 618 if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) { 619 if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 620 ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 621 } 622 link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 623 } 624 if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) { 625 if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 626 ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 627 } 628 link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 629 } 630 #endif 631 632 /* Set `current` state of the link. They may change between different SF invocations with the same link */ 633 if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 634 if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 635 if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 636 } 637 638 link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 639 link->leafdata = leafdata; 640 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 641 link->rootdirect[i] = rootdirect[i]; 642 link->leafdirect[i] = leafdirect[i]; 643 } 644 link->rootdirect_mpi = rootdirect_mpi; 645 link->leafdirect_mpi = leafdirect_mpi; 646 link->rootmtype = rootmtype; 647 link->leafmtype = leafmtype; 648 link->rootmtype_mpi = rootmtype_mpi; 649 link->leafmtype_mpi = leafmtype_mpi; 650 651 link->next = bas->inuse; 652 bas->inuse = link; 653 *mylink = link; 654 PetscFunctionReturn(0); 655 } 656 657 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 658 If the sf uses persistent requests and the requests have not been initialized, then initialize them. 659 */ 660 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 661 { 662 PetscErrorCode ierr; 663 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 664 PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 665 const PetscInt *rootoffset,*leafoffset; 666 PetscMPIInt n; 667 MPI_Aint disp; 668 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 669 MPI_Datatype unit = link->unit; 670 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 671 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 672 673 PetscFunctionBegin; 674 /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 675 if (sf->persistent) { 676 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 677 ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 678 if (direction == PETSCSF_LEAF2ROOT) { 679 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 680 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 681 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 682 ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 683 } 684 } else { /* PETSCSF_ROOT2LEAF */ 685 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 686 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 687 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 688 ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 689 } 690 } 691 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 692 } 693 694 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 695 ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 696 if (direction == PETSCSF_LEAF2ROOT) { 697 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 698 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 699 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 700 ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 701 } 702 } else { /* PETSCSF_ROOT2LEAF */ 703 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 704 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 705 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 706 ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 707 } 708 } 709 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 710 } 711 } 712 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 713 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 714 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 715 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 716 PetscFunctionReturn(0); 717 } 718 719 720 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 721 { 722 PetscErrorCode ierr; 723 PetscSFLink link,*p; 724 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 725 726 PetscFunctionBegin; 727 /* Look for types in cache */ 728 for (p=&bas->inuse; (link=*p); p=&link->next) { 729 PetscBool match; 730 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 731 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 732 switch (cmode) { 733 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 734 case PETSC_USE_POINTER: break; 735 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 736 } 737 *mylink = link; 738 PetscFunctionReturn(0); 739 } 740 } 741 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 742 } 743 744 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link) 745 { 746 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 747 748 PetscFunctionBegin; 749 (*link)->rootdata = NULL; 750 (*link)->leafdata = NULL; 751 (*link)->next = bas->avail; 752 bas->avail = *link; 753 *link = NULL; 754 PetscFunctionReturn(0); 755 } 756 757 /* Destroy all links chained in 'avail' */ 758 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail) 759 { 760 PetscErrorCode ierr; 761 PetscSFLink link = *avail,next; 762 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 763 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 764 765 PetscFunctionBegin; 766 for (; link; link=next) { 767 next = link->next; 768 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 769 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 770 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 771 } 772 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 773 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 774 ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 775 ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 776 #if defined(PETSC_HAVE_DEVICE) 777 ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 778 ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 779 #endif 780 } 781 #if defined(PETSC_HAVE_DEVICE) 782 if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);} 783 #if defined(PETSC_HAVE_CUDA) 784 if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;} 785 #elif defined(PETSC_HAVE_HIP) 786 if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */ 787 #endif 788 #endif 789 ierr = PetscFree(link);CHKERRQ(ierr); 790 } 791 *avail = NULL; 792 PetscFunctionReturn(0); 793 } 794 795 /* Error out on unsupported overlapped communications */ 796 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 797 { 798 PetscErrorCode ierr; 799 PetscSFLink link,*p; 800 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 801 PetscBool match; 802 803 PetscFunctionBegin; 804 if (PetscDefined(USE_DEBUG)) { 805 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 806 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 807 */ 808 if (rootdata || leafdata) { 809 for (p=&bas->inuse; (link=*p); p=&link->next) { 810 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 811 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 812 } 813 } 814 } 815 PetscFunctionReturn(0); 816 } 817 818 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 819 { 820 PetscFunctionBegin; 821 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 822 PetscFunctionReturn(0); 823 } 824 825 826 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 827 { 828 PetscErrorCode ierr; 829 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 830 PetscBool is2Int,is2PetscInt; 831 PetscMPIInt ni,na,nd,combiner; 832 #if defined(PETSC_HAVE_COMPLEX) 833 PetscInt nPetscComplex=0; 834 #endif 835 836 PetscFunctionBegin; 837 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 838 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 839 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 840 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 841 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 842 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 843 #if defined(PETSC_HAVE_COMPLEX) 844 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 845 #endif 846 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 847 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 848 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 849 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 850 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 851 link->bs = 1; /* default */ 852 853 if (is2Int) { 854 PackInit_PairType_int_int_1_1(link); 855 link->bs = 1; 856 link->unitbytes = 2*sizeof(int); 857 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 858 link->basicunit = MPI_2INT; 859 link->unit = MPI_2INT; 860 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 861 PackInit_PairType_PetscInt_PetscInt_1_1(link); 862 link->bs = 1; 863 link->unitbytes = 2*sizeof(PetscInt); 864 link->basicunit = MPIU_2INT; 865 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 866 link->unit = MPIU_2INT; 867 } else if (nPetscReal) { 868 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 869 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 870 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 871 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 872 link->bs = nPetscReal; 873 link->unitbytes = nPetscReal*sizeof(PetscReal); 874 link->basicunit = MPIU_REAL; 875 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 876 } else if (nPetscInt) { 877 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 878 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 879 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 880 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 881 link->bs = nPetscInt; 882 link->unitbytes = nPetscInt*sizeof(PetscInt); 883 link->basicunit = MPIU_INT; 884 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 885 #if defined(PETSC_USE_64BIT_INDICES) 886 } else if (nInt) { 887 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 888 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 889 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 890 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 891 link->bs = nInt; 892 link->unitbytes = nInt*sizeof(int); 893 link->basicunit = MPI_INT; 894 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 895 #endif 896 } else if (nSignedChar) { 897 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 898 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 899 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 900 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 901 link->bs = nSignedChar; 902 link->unitbytes = nSignedChar*sizeof(SignedChar); 903 link->basicunit = MPI_SIGNED_CHAR; 904 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 905 } else if (nUnsignedChar) { 906 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 907 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 908 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 909 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 910 link->bs = nUnsignedChar; 911 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 912 link->basicunit = MPI_UNSIGNED_CHAR; 913 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 914 #if defined(PETSC_HAVE_COMPLEX) 915 } else if (nPetscComplex) { 916 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 917 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 918 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 919 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 920 link->bs = nPetscComplex; 921 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 922 link->basicunit = MPIU_COMPLEX; 923 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 924 #endif 925 } else { 926 MPI_Aint lb,nbyte; 927 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 928 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 929 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 930 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 931 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 932 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 933 link->bs = nbyte; 934 link->unitbytes = nbyte; 935 link->basicunit = MPI_BYTE; 936 } else { 937 nInt = nbyte / sizeof(int); 938 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 939 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 940 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 941 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 942 link->bs = nInt; 943 link->unitbytes = nbyte; 944 link->basicunit = MPI_INT; 945 } 946 if (link->isbuiltin) link->unit = unit; 947 } 948 949 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 950 951 link->Memcpy = PetscSFLinkMemcpy_Host; 952 PetscFunctionReturn(0); 953 } 954 955 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 956 { 957 PetscFunctionBegin; 958 *UnpackAndOp = NULL; 959 if (mtype == PETSC_MEMTYPE_HOST) { 960 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 961 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 962 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 963 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 964 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 965 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 966 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 967 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 968 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 969 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 970 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 971 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 972 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 973 } 974 #if defined(PETSC_HAVE_DEVICE) 975 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 976 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 977 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 978 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 979 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 980 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 981 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 982 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 983 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 984 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 985 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 986 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 987 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 988 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 989 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 990 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 991 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 992 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 993 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 994 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 995 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 996 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 997 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 998 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 999 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 1000 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 1001 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 1002 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 1003 } 1004 #endif 1005 PetscFunctionReturn(0); 1006 } 1007 1008 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 1009 { 1010 PetscFunctionBegin; 1011 *ScatterAndOp = NULL; 1012 if (mtype == PETSC_MEMTYPE_HOST) { 1013 if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 1014 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 1015 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 1016 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 1017 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 1018 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 1019 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 1020 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 1021 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 1022 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 1023 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 1024 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 1025 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 1026 } 1027 #if defined(PETSC_HAVE_DEVICE) 1028 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 1029 if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 1030 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 1031 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 1032 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 1033 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 1034 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1035 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1036 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1037 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1038 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 1039 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1040 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1041 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1042 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1043 if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1044 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1045 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1046 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1047 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1048 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1049 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1050 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1051 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1052 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1053 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1054 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1055 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 1056 } 1057 #endif 1058 PetscFunctionReturn(0); 1059 } 1060 1061 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 1062 { 1063 PetscFunctionBegin; 1064 *FetchAndOp = NULL; 1065 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1066 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1067 #if defined(PETSC_HAVE_DEVICE) 1068 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1069 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd; 1070 #endif 1071 PetscFunctionReturn(0); 1072 } 1073 1074 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 1075 { 1076 PetscFunctionBegin; 1077 *FetchAndOpLocal = NULL; 1078 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1079 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1080 #if defined(PETSC_HAVE_DEVICE) 1081 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1082 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1083 #endif 1084 PetscFunctionReturn(0); 1085 } 1086 1087 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1088 { 1089 PetscErrorCode ierr; 1090 PetscLogDouble flops; 1091 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1092 1093 1094 PetscFunctionBegin; 1095 if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1096 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1097 #if defined(PETSC_HAVE_DEVICE) 1098 if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1099 #endif 1100 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1101 } 1102 PetscFunctionReturn(0); 1103 } 1104 1105 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1106 { 1107 PetscLogDouble flops; 1108 PetscErrorCode ierr; 1109 1110 PetscFunctionBegin; 1111 if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1112 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1113 #if defined(PETSC_HAVE_DEVICE) 1114 if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1115 #endif 1116 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1117 } 1118 PetscFunctionReturn(0); 1119 } 1120 1121 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 1122 Input Arguments: 1123 +sf - The StarForest 1124 .link - The link 1125 .count - Number of entries to unpack 1126 .start - The first index, significent when indices=NULL 1127 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 1128 .buf - A contiguous buffer to unpack from 1129 -op - Operation after unpack 1130 1131 Output Arguments: 1132 .data - The data to unpack to 1133 */ 1134 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 1135 { 1136 PetscFunctionBegin; 1137 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1138 { 1139 PetscErrorCode ierr; 1140 PetscInt i; 1141 PetscMPIInt n; 1142 if (indices) { 1143 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 1144 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 1145 */ 1146 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 1147 } else { 1148 ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 1149 ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr); 1150 } 1151 } 1152 PetscFunctionReturn(0); 1153 #else 1154 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1155 #endif 1156 } 1157 1158 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 1159 { 1160 PetscFunctionBegin; 1161 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1162 { 1163 PetscErrorCode ierr; 1164 PetscInt i,disp; 1165 if (!srcIdx) { 1166 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 1167 } else { 1168 for (i=0; i<count; i++) { 1169 disp = dstIdx? dstIdx[i] : dstStart + i; 1170 ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr); 1171 } 1172 } 1173 } 1174 PetscFunctionReturn(0); 1175 #else 1176 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1177 #endif 1178 } 1179 1180 /*============================================================================= 1181 Pack/Unpack/Fetch/Scatter routines 1182 ============================================================================*/ 1183 1184 /* Pack rootdata to rootbuf 1185 Input Arguments: 1186 + sf - The SF this packing works on. 1187 . link - It gives the memtype of the roots and also provides root buffer. 1188 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1189 - rootdata - Where to read the roots. 1190 1191 Notes: 1192 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1193 in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not) 1194 */ 1195 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1196 { 1197 PetscErrorCode ierr; 1198 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1199 const PetscInt *rootindices = NULL; 1200 PetscInt count,start; 1201 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1202 PetscMemType rootmtype = link->rootmtype; 1203 PetscSFPackOpt opt = NULL; 1204 1205 PetscFunctionBegin; 1206 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1207 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1208 if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1209 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1210 ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1211 ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1212 } 1213 if (scope == PETSCSF_REMOTE) { 1214 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1215 ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1216 } 1217 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1218 PetscFunctionReturn(0); 1219 } 1220 1221 /* Pack leafdata to leafbuf */ 1222 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1223 { 1224 PetscErrorCode ierr; 1225 const PetscInt *leafindices = NULL; 1226 PetscInt count,start; 1227 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1228 PetscMemType leafmtype = link->leafmtype; 1229 PetscSFPackOpt opt = NULL; 1230 1231 PetscFunctionBegin; 1232 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1233 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1234 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1235 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1236 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1237 ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1238 } 1239 if (scope == PETSCSF_REMOTE) { 1240 ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1241 ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1242 } 1243 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1244 PetscFunctionReturn(0); 1245 } 1246 1247 /* Unpack rootbuf to rootdata */ 1248 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1249 { 1250 PetscErrorCode ierr; 1251 const PetscInt *rootindices = NULL; 1252 PetscInt count,start; 1253 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1254 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1255 PetscMemType rootmtype = link->rootmtype; 1256 PetscSFPackOpt opt = NULL; 1257 1258 PetscFunctionBegin; 1259 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1260 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1261 if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1262 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1263 if (UnpackAndOp) { 1264 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1265 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1266 } else { 1267 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1268 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1269 } 1270 } 1271 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1272 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1273 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1274 PetscFunctionReturn(0); 1275 } 1276 1277 /* Unpack leafbuf to leafdata */ 1278 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1279 { 1280 PetscErrorCode ierr; 1281 const PetscInt *leafindices = NULL; 1282 PetscInt count,start; 1283 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1284 PetscMemType leafmtype = link->leafmtype; 1285 PetscSFPackOpt opt = NULL; 1286 1287 PetscFunctionBegin; 1288 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1289 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1290 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1291 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1292 if (UnpackAndOp) { 1293 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1294 ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1295 } else { 1296 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1297 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1298 } 1299 } 1300 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1301 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1302 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1303 PetscFunctionReturn(0); 1304 } 1305 1306 /* FetchAndOp rootdata with rootbuf */ 1307 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1308 { 1309 PetscErrorCode ierr; 1310 const PetscInt *rootindices = NULL; 1311 PetscInt count,start; 1312 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1313 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1314 PetscMemType rootmtype = link->rootmtype; 1315 PetscSFPackOpt opt = NULL; 1316 1317 PetscFunctionBegin; 1318 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1319 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1320 if (bas->rootbuflen[scope]) { 1321 /* Do FetchAndOp on rootdata with rootbuf */ 1322 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1323 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1324 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1325 } 1326 if (scope == PETSCSF_REMOTE) { 1327 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr); 1328 ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr); 1329 } 1330 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1331 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1332 PetscFunctionReturn(0); 1333 } 1334 1335 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */ 1336 PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op) 1337 { 1338 PetscErrorCode ierr; 1339 const PetscInt *rootindices = NULL,*leafindices = NULL; 1340 PetscInt count,rootstart,leafstart; 1341 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1342 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1343 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1344 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1345 1346 PetscFunctionBegin; 1347 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1348 if (rootmtype != leafmtype) { /* Uncommon case */ 1349 /* The local communication has to go through pack and unpack */ 1350 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1351 ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1352 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1353 } else { 1354 ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1355 if (ScatterAndOp) { 1356 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1357 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1358 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1359 } else { 1360 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1361 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1362 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1363 } 1364 } 1365 PetscFunctionReturn(0); 1366 } 1367 1368 /* Reduce leafdata to rootdata locally */ 1369 PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op) 1370 { 1371 PetscErrorCode ierr; 1372 const PetscInt *rootindices = NULL,*leafindices = NULL; 1373 PetscInt count,rootstart,leafstart; 1374 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1375 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1376 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1377 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1378 1379 PetscFunctionBegin; 1380 if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1381 if (rootmtype != leafmtype) { 1382 /* The local communication has to go through pack and unpack */ 1383 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1384 ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1385 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1386 } else { 1387 ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1388 if (ScatterAndOp) { 1389 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1390 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1391 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1392 } else { 1393 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1394 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1395 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1396 } 1397 } 1398 PetscFunctionReturn(0); 1399 } 1400 1401 /* Fetch rootdata to leafdata and leafupdate locally */ 1402 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1403 { 1404 PetscErrorCode ierr; 1405 const PetscInt *rootindices = NULL,*leafindices = NULL; 1406 PetscInt count,rootstart,leafstart; 1407 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1408 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1409 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1410 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1411 1412 PetscFunctionBegin; 1413 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1414 if (rootmtype != leafmtype) { 1415 /* The local communication has to go through pack and unpack */ 1416 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1417 } else { 1418 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1419 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1420 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1421 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1422 } 1423 PetscFunctionReturn(0); 1424 } 1425 1426 /* 1427 Create per-rank pack/unpack optimizations based on indice patterns 1428 1429 Input Parameters: 1430 + n - Number of destination ranks 1431 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1432 - idx - [*] Array storing indices 1433 1434 Output Parameters: 1435 + opt - Pack optimizations. NULL if no optimizations. 1436 */ 1437 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1438 { 1439 PetscErrorCode ierr; 1440 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1441 PetscBool optimizable = PETSC_TRUE; 1442 PetscSFPackOpt opt; 1443 1444 PetscFunctionBegin; 1445 ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1446 ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1447 opt->n = opt->array[0] = n; 1448 opt->offset = opt->array + 1; 1449 opt->start = opt->array + n + 2; 1450 opt->dx = opt->array + 2*n + 2; 1451 opt->dy = opt->array + 3*n + 2; 1452 opt->dz = opt->array + 4*n + 2; 1453 opt->X = opt->array + 5*n + 2; 1454 opt->Y = opt->array + 6*n + 2; 1455 1456 for (r=0; r<n; r++) { /* For each destination rank */ 1457 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1458 p = offset[r]; 1459 start = idx[p]; /* First index for this rank */ 1460 p++; 1461 1462 /* Search in X dimension */ 1463 for (dx=1; dx<m; dx++,p++) { 1464 if (start+dx != idx[p]) break; 1465 } 1466 1467 dydz = m/dx; 1468 X = dydz > 1 ? (idx[p]-start) : dx; 1469 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1470 if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1471 for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1472 for (i=0; i<dx; i++,p++) { 1473 if (start+X*dy+i != idx[p]) { 1474 if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1475 else goto Z_dimension; 1476 } 1477 } 1478 } 1479 1480 Z_dimension: 1481 dz = m/(dx*dy); 1482 Y = dz > 1 ? (idx[p]-start)/X : dy; 1483 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1484 if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1485 for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1486 for (j=0; j<dy; j++) { 1487 for (i=0; i<dx; i++,p++) { 1488 if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 1489 } 1490 } 1491 } 1492 opt->start[r] = start; 1493 opt->dx[r] = dx; 1494 opt->dy[r] = dy; 1495 opt->dz[r] = dz; 1496 opt->X[r] = X; 1497 opt->Y[r] = Y; 1498 } 1499 1500 finish: 1501 /* If not optimizable, free arrays to save memory */ 1502 if (!n || !optimizable) { 1503 ierr = PetscFree(opt->array);CHKERRQ(ierr); 1504 ierr = PetscFree(opt);CHKERRQ(ierr); 1505 *out = NULL; 1506 } else { 1507 opt->offset[0] = 0; 1508 for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1509 *out = opt; 1510 } 1511 PetscFunctionReturn(0); 1512 } 1513 1514 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 1515 { 1516 PetscErrorCode ierr; 1517 PetscSFPackOpt opt = *out; 1518 1519 PetscFunctionBegin; 1520 if (opt) { 1521 ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr); 1522 ierr = PetscFree(opt);CHKERRQ(ierr); 1523 *out = NULL; 1524 } 1525 PetscFunctionReturn(0); 1526 } 1527 1528 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1529 { 1530 PetscErrorCode ierr; 1531 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1532 PetscInt i,j; 1533 1534 PetscFunctionBegin; 1535 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1536 for (i=0; i<2; i++) { /* Set defaults */ 1537 sf->leafstart[i] = 0; 1538 sf->leafcontig[i] = PETSC_TRUE; 1539 sf->leafdups[i] = PETSC_FALSE; 1540 bas->rootstart[i] = 0; 1541 bas->rootcontig[i] = PETSC_TRUE; 1542 bas->rootdups[i] = PETSC_FALSE; 1543 } 1544 1545 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1546 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1547 1548 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1549 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1550 1551 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1552 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1553 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1554 } 1555 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1556 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1557 } 1558 1559 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1560 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1561 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1562 1563 /* Are root indices for self and remote contiguous? */ 1564 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1565 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1566 1567 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1568 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1569 1570 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1571 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1572 } 1573 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1574 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1575 } 1576 1577 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1578 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1579 1580 #if defined(PETSC_HAVE_DEVICE) 1581 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1582 if (!sf->leafcontig[0]) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1583 if (!sf->leafcontig[1]) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1584 if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1585 if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1586 #endif 1587 1588 PetscFunctionReturn(0); 1589 } 1590 1591 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1592 { 1593 PetscErrorCode ierr; 1594 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1595 PetscInt i; 1596 1597 PetscFunctionBegin; 1598 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1599 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1600 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1601 #if defined(PETSC_HAVE_DEVICE) 1602 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1603 ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1604 #endif 1605 } 1606 PetscFunctionReturn(0); 1607 } 1608