#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#include <petsccublas.h>
#endif
#if defined(PETSC_HAVE_HIP)
#include <hip/hip_runtime.h>
#include <petschipblas.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Concatenate four tokens with '_' separators, e.g., CPPJoin4(Pack,int,1,1) -> Pack_int_1_1 */
#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)    /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)    /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR; the op argument is ignored (see note above DEF_Log) */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
/* Ref MPI MAXLOC */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while (0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bsz.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to packed.
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);} /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2 += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname    Name of the Op, such as Add, Mult, LAND, etc.
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   .OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each entry, the old value in
   unpacked[] is applied Op with the incoming packed value, and packed[] returns the old
   (pre-op) value, like MPI_Fetch_and_op. Arguments as in DEF_UnpackAndOp. */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that applies Op directly from a src
   array to a dst array (no intermediate pack buffer). When src is contiguous it delegates to the
   corresponding UnpackAndOp; a special fast path handles 3D-strided src with contiguous dst. */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - macro defining an on-process FetchAndOp: the old root values are first
   saved into leafupdate[], then Op is applied to rootdata with the corresponding leafdata. */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Logical ops.
   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90"
 */
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)

DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair structs (value u, index i) used by the Maxloc/Minloc routines, cf. MPI_2INT */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u;      int i;}      PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback for MPI implementations lacking MPI_Type_dup: a committed contiguous(1) copy
   serves as a duplicate of the datatype for our purposes */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   The routine Creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv.
   It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even if we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
   need pack/unpack data.
 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdirect with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
       */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 sets of requests: 2 directions x 2 memory types x root/leafdirect 0 or 1 (see the triple loop below) */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
#if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
#endif
#if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP)    {ierr = PetscSFLinkSetUp_Hip(sf,link,unit);CHKERRQ(ierr);}
#endif
#if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
#endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Roots receive from (or send to, in the other branch) the nrootranks-ndrootranks remote leaf ranks */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}

/* Find the in-use link matching (unit, rootdata, leafdata). With PETSC_OWN_POINTER the link is
   also removed from the inuse list (caller takes ownership); with PETSC_USE_POINTER it stays.
   Errors out if no matching link exists. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
}

/* Return a link to the sf's 'avail' cache so it can be reused by a later communication;
   the root/leafdata keys are cleared and the caller's pointer is nulled */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
#endif
    }
#if defined(PETSC_HAVE_DEVICE)
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
    /* TODO: Make runtime */
#if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
#elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRHIP(cerr); link->stream = NULL;}
#endif
#endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
       the potential overlapping since this process does not participate in communication. Overlapping is harmless.
815 */ 816 if (rootdata || leafdata) { 817 for (p=&bas->inuse; (link=*p); p=&link->next) { 818 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 819 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 820 } 821 } 822 } 823 PetscFunctionReturn(0); 824 } 825 826 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 827 { 828 PetscFunctionBegin; 829 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 830 PetscFunctionReturn(0); 831 } 832 833 834 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 835 { 836 PetscErrorCode ierr; 837 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 838 PetscBool is2Int,is2PetscInt; 839 PetscMPIInt ni,na,nd,combiner; 840 #if defined(PETSC_HAVE_COMPLEX) 841 PetscInt nPetscComplex=0; 842 #endif 843 844 PetscFunctionBegin; 845 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 846 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 847 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 848 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 849 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 850 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 851 #if defined(PETSC_HAVE_COMPLEX) 852 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 853 #endif 854 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 855 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 856 /* TODO: shaell we also handle Fortran 
MPI_2REAL? */ 857 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 858 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 859 link->bs = 1; /* default */ 860 861 if (is2Int) { 862 PackInit_PairType_int_int_1_1(link); 863 link->bs = 1; 864 link->unitbytes = 2*sizeof(int); 865 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 866 link->basicunit = MPI_2INT; 867 link->unit = MPI_2INT; 868 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 869 PackInit_PairType_PetscInt_PetscInt_1_1(link); 870 link->bs = 1; 871 link->unitbytes = 2*sizeof(PetscInt); 872 link->basicunit = MPIU_2INT; 873 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 874 link->unit = MPIU_2INT; 875 } else if (nPetscReal) { 876 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 877 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 878 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 879 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 880 link->bs = nPetscReal; 881 link->unitbytes = nPetscReal*sizeof(PetscReal); 882 link->basicunit = MPIU_REAL; 883 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 884 } else if (nPetscInt) { 885 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 886 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 887 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if 
(nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 888 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 889 link->bs = nPetscInt; 890 link->unitbytes = nPetscInt*sizeof(PetscInt); 891 link->basicunit = MPIU_INT; 892 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 893 #if defined(PETSC_USE_64BIT_INDICES) 894 } else if (nInt) { 895 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 896 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 897 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 898 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 899 link->bs = nInt; 900 link->unitbytes = nInt*sizeof(int); 901 link->basicunit = MPI_INT; 902 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 903 #endif 904 } else if (nSignedChar) { 905 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 906 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 907 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 908 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 909 link->bs = nSignedChar; 910 link->unitbytes = nSignedChar*sizeof(SignedChar); 911 link->basicunit = MPI_SIGNED_CHAR; 912 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 913 } else if (nUnsignedChar) { 914 if (nUnsignedChar == 8) 
PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 915 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 916 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 917 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 918 link->bs = nUnsignedChar; 919 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 920 link->basicunit = MPI_UNSIGNED_CHAR; 921 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 922 #if defined(PETSC_HAVE_COMPLEX) 923 } else if (nPetscComplex) { 924 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 925 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 926 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 927 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 928 link->bs = nPetscComplex; 929 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 930 link->basicunit = MPIU_COMPLEX; 931 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 932 #endif 933 } else { 934 MPI_Aint lb,nbyte; 935 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 936 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 937 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 938 if (nbyte == 
4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 939 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 940 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 941 link->bs = nbyte; 942 link->unitbytes = nbyte; 943 link->basicunit = MPI_BYTE; 944 } else { 945 nInt = nbyte / sizeof(int); 946 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 947 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 948 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 949 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 950 link->bs = nInt; 951 link->unitbytes = nbyte; 952 link->basicunit = MPI_INT; 953 } 954 if (link->isbuiltin) link->unit = unit; 955 } 956 957 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 958 959 link->Memcpy = PetscSFLinkMemcpy_Host; 960 PetscFunctionReturn(0); 961 } 962 963 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 964 { 965 PetscFunctionBegin; 966 *UnpackAndOp = NULL; 967 if (mtype == PETSC_MEMTYPE_HOST) { 968 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 969 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 970 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 971 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 972 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 973 else if (op == MPI_LAND) *UnpackAndOp = 
link->h_UnpackAndLAND; 974 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 975 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 976 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 977 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 978 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 979 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 980 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 981 } 982 #if defined(PETSC_HAVE_DEVICE) 983 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 984 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 985 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 986 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 987 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 988 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 989 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 990 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 991 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 992 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 993 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 994 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 995 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 996 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 997 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 998 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 999 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 1000 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 1001 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 1002 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 1003 else if (op == MPI_LAND) 
*UnpackAndOp = link->da_UnpackAndLAND; 1004 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 1005 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 1006 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 1007 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 1008 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 1009 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 1010 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 1011 } 1012 #endif 1013 PetscFunctionReturn(0); 1014 } 1015 1016 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 1017 { 1018 PetscFunctionBegin; 1019 *ScatterAndOp = NULL; 1020 if (mtype == PETSC_MEMTYPE_HOST) { 1021 if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 1022 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 1023 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 1024 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 1025 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 1026 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 1027 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 1028 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 1029 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 1030 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 1031 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 1032 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 1033 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 1034 } 1035 #if defined(PETSC_HAVE_DEVICE) 1036 else if (mtype == PETSC_MEMTYPE_DEVICE 
&& !atomic) { 1037 if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 1038 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 1039 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 1040 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 1041 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 1042 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1043 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1044 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1045 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1046 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 1047 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1048 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1049 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1050 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1051 if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1052 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1053 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1054 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1055 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1056 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1057 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1058 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1059 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1060 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1061 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1062 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1063 else if (op == MPI_MINLOC) *ScatterAndOp = 
link->da_ScatterAndMinloc; 1064 } 1065 #endif 1066 PetscFunctionReturn(0); 1067 } 1068 1069 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 1070 { 1071 PetscFunctionBegin; 1072 *FetchAndOp = NULL; 1073 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1074 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1075 #if defined(PETSC_HAVE_DEVICE) 1076 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1077 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd; 1078 #endif 1079 PetscFunctionReturn(0); 1080 } 1081 1082 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 1083 { 1084 PetscFunctionBegin; 1085 *FetchAndOpLocal = NULL; 1086 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1087 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1088 #if defined(PETSC_HAVE_DEVICE) 1089 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1090 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1091 #endif 1092 PetscFunctionReturn(0); 1093 } 1094 1095 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1096 { 1097 PetscErrorCode ierr; 1098 PetscLogDouble flops; 1099 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1100 1101 1102 PetscFunctionBegin; 1103 if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) 
{ /* op is a reduction on PetscScalars */ 1104 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1105 #if defined(PETSC_HAVE_DEVICE) 1106 if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1107 #endif 1108 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1109 } 1110 PetscFunctionReturn(0); 1111 } 1112 1113 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1114 { 1115 PetscLogDouble flops; 1116 PetscErrorCode ierr; 1117 1118 PetscFunctionBegin; 1119 if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1120 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1121 #if defined(PETSC_HAVE_DEVICE) 1122 if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1123 #endif 1124 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1125 } 1126 PetscFunctionReturn(0); 1127 } 1128 1129 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 1130 Input Arguments: 1131 +sf - The StarForest 1132 .link - The link 1133 .count - Number of entries to unpack 1134 .start - The first index, significent when indices=NULL 1135 .indices - Indices of entries in <data>. 
If NULL, it means indices are contiguous and the first is given in <start> 1136 .buf - A contiguous buffer to unpack from 1137 -op - Operation after unpack 1138 1139 Output Arguments: 1140 .data - The data to unpack to 1141 */ 1142 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 1143 { 1144 PetscFunctionBegin; 1145 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1146 { 1147 PetscErrorCode ierr; 1148 PetscInt i; 1149 PetscMPIInt n; 1150 if (indices) { 1151 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 1152 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 1153 */ 1154 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 1155 } else { 1156 ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 1157 ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr); 1158 } 1159 } 1160 PetscFunctionReturn(0); 1161 #else 1162 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1163 #endif 1164 } 1165 1166 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 1167 { 1168 PetscFunctionBegin; 1169 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1170 { 1171 PetscErrorCode ierr; 1172 PetscInt i,disp; 1173 if (!srcIdx) { 1174 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 1175 } else { 1176 for (i=0; i<count; i++) { 1177 disp = dstIdx? 
dstIdx[i] : dstStart + i; 1178 ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 1179 } 1180 } 1181 } 1182 PetscFunctionReturn(0); 1183 #else 1184 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1185 #endif 1186 } 1187 1188 /*============================================================================= 1189 Pack/Unpack/Fetch/Scatter routines 1190 ============================================================================*/ 1191 1192 /* Pack rootdata to rootbuf 1193 Input Arguments: 1194 + sf - The SF this packing works on. 1195 . link - It gives the memtype of the roots and also provides root buffer. 1196 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1197 - rootdata - Where to read the roots. 1198 1199 Notes: 1200 When rootdata can be directly used as root buffer, the routine is almost a no-op. 
After the call, root data is 1201 in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not) 1202 */ 1203 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1204 { 1205 PetscErrorCode ierr; 1206 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1207 const PetscInt *rootindices = NULL; 1208 PetscInt count,start; 1209 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1210 PetscMemType rootmtype = link->rootmtype; 1211 PetscSFPackOpt opt = NULL; 1212 1213 /*TODO: SEK Problems are in here*/ 1214 PetscFunctionBegin; 1215 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1216 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1217 if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1218 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1219 ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1220 ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1221 } 1222 if (scope == PETSCSF_REMOTE) { 1223 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1224 ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1225 } 1226 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1227 PetscFunctionReturn(0); 1228 } 1229 1230 /* Pack leafdata to leafbuf */ 1231 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1232 { 1233 PetscErrorCode ierr; 1234 const PetscInt *leafindices = NULL; 1235 PetscInt count,start; 1236 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1237 PetscMemType leafmtype = 
link->leafmtype; 1238 PetscSFPackOpt opt = NULL; 1239 1240 PetscFunctionBegin; 1241 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1242 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1243 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1244 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1245 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1246 ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1247 } 1248 if (scope == PETSCSF_REMOTE) { 1249 ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1250 ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1251 } 1252 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1253 PetscFunctionReturn(0); 1254 } 1255 1256 /* Unpack rootbuf to rootdata */ 1257 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1258 { 1259 PetscErrorCode ierr; 1260 const PetscInt *rootindices = NULL; 1261 PetscInt count,start; 1262 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1263 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1264 PetscMemType rootmtype = link->rootmtype; 1265 PetscSFPackOpt opt = NULL; 1266 1267 PetscFunctionBegin; 1268 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1269 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1270 if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1271 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1272 if 
(UnpackAndOp) { 1273 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1274 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1275 } else { 1276 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1277 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1278 } 1279 } 1280 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1281 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1282 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1283 PetscFunctionReturn(0); 1284 } 1285 1286 /* Unpack leafbuf to leafdata */ 1287 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1288 { 1289 PetscErrorCode ierr; 1290 const PetscInt *leafindices = NULL; 1291 PetscInt count,start; 1292 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1293 PetscMemType leafmtype = link->leafmtype; 1294 PetscSFPackOpt opt = NULL; 1295 1296 PetscFunctionBegin; 1297 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1298 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1299 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1300 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1301 if (UnpackAndOp) { 1302 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1303 ierr = 
(*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1304 } else { 1305 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1306 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1307 } 1308 } 1309 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1310 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1311 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1312 PetscFunctionReturn(0); 1313 } 1314 1315 /* FetchAndOp rootdata with rootbuf */ 1316 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1317 { 1318 PetscErrorCode ierr; 1319 const PetscInt *rootindices = NULL; 1320 PetscInt count,start; 1321 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1322 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1323 PetscMemType rootmtype = link->rootmtype; 1324 PetscSFPackOpt opt = NULL; 1325 1326 PetscFunctionBegin; 1327 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1328 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1329 if (bas->rootbuflen[scope]) { 1330 /* Do FetchAndOp on rootdata with rootbuf */ 1331 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1332 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1333 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1334 } 1335 if (scope == PETSCSF_REMOTE) { 1336 ierr = 
PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL).

   Notes:
   When root and leaf data live in different memory types (host vs. device), the scatter
   cannot be done in one shot: pack rootdata into rootbuf, memcpy it into leafbuf across
   memory types, then unpack into leafdata. Otherwise a fused ScatterAndOp routine is used
   when available, falling back to MPI_Reduce_local on host-side indices. */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* Nothing to communicate locally */
  if (rootmtype != leafmtype) { /* Uncommon case */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      /* Scatter FROM (rootstart,rootindices,rootdata) TO (leafstart,leafindices,leafdata) */
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* No hand-coded routine for this op; fall back to MPI_Reduce_local with host-side indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Reduce leafdata to rootdata locally (PETSCSF_LOCAL); the mirror image of
   PetscSFLinkBcastAndOpLocal, going leaf -> root instead of root -> leaf. */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* Nothing to communicate locally */
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      /* Scatter FROM (leafstart,leafindices,leafdata) TO (rootstart,rootindices,rootdata) */
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      /* No hand-coded routine for this op; fall back to MPI_Reduce_local with host-side indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata into leafupdate while combining leafdata into rootdata with op, all locally
   (PETSCSF_LOCAL scope only).

   Notes:
   Unlike the local Bcast/Reduce above, a cross-memory-type (host vs. device) fetch-and-op
   is not supported here and raises PETSC_ERR_SUP. */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* Nothing to communicate locally */
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Create per-rank pack/unpack optimizations based on index patterns

  For each rank, the m indices are tested against a 3D-strided pattern: a contiguous run of
  dx indices (X dimension), repeated dy times at stride X (Y dimension), repeated dz times
  at stride X*Y (Z dimension), so m = dx*dy*dz. When every rank's indices fit such a
  pattern, pack/unpack can use strided copies instead of per-index gathers (see DEF_PackFunc).

  Input Parameters:
+ n      - Number of destination ranks
. offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
- idx    - [*] Array storing indices

  Output Parameters:
+ out    - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  /* One allocation carved into fields: array[0]=n, offset[n+1], then start/dx/dy/dz/X/Y each of length n => 7*n+2 */
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m = offset[r+1] - offset[r]; /* Total number of indices for this rank.
We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: dx = length of the leading contiguous run */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    X    = dydz > 1 ? (idx[p]-start) : dx; /* X = stride between consecutive x-walks */
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* Clean break at an x-walk boundary: y-extent is this dy */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    Y  = dz > 1 ? (idx[p]-start)/X : dy; /* Y = stride (in units of x-walks) between consecutive xy-planes */
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable (or there are no ranks at all), free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Prefix-sum the per-rank packed sizes into offset[] */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

/* Destroy a pack optimization plan created by PetscSFCreatePackOpt and set *out to NULL.
   mtype tells PetscSFFree in which memory type (host/device) opt->array was allocated. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL; /* Defend against double-destroy */
  }
  PetscFunctionReturn(0);
}

/* Set up fields used by pack/unpack: buffer lengths, start indices, contiguity flags,
   per-rank pack optimization plans and (on device builds) duplicate-index flags, for
   both the leaf side and the root side. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous?
*/
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  /* Contiguity checks on root indices, mirroring the leaf checks earlier in this function */
  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { /* self */
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { /* remote */
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}

/* Release the pack/unpack optimization plans for both local and remote scopes: the host
   plans created in PetscSFSetUpPackFields and, on device builds, the device-side copies
   (presumably mirrored elsewhere into leafpackopt_d/rootpackopt_d — not created here). */
PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}