#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#include <petsccublas.h>
#endif
#if defined(PETSC_HAVE_HIP)
#include <hip/hip_runtime.h>
#include <petschipblas.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Paste four tokens together with underscores, e.g., CPPJoin4(Pack,int,1,1) -> Pack_int_1_1 */
#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0)
/* Ref MPI MAXLOC: on equal values keep the smaller index; otherwise keep the (value,index) pair
   selected by op (op is > for MAXLOC, < for MINLOC). The pair type has fields .u (value) and .i (index). */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while (0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bsz.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
*/
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2 += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
*/
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname    Name of the Op, such as Add, Mult, LAND, etc.
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   .OpApply   Macro defining application of the op.
              Could be OP_BINARY, OP_FUNCTION, OP_LXOR
*/
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each entry, save the old
   value of the unpacked (root) data into the buffer, then apply Op with the buffer's
   incoming value. Arguments follow the same convention as DEF_UnpackAndOp. */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that applies Op from a sparse/contiguous
   src directly into a sparse/contiguous dst (local scatter, no intermediate buffer needed when
   src is contiguous - then it forwards to the matching UnpackAnd##Opname routine). */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - macro defining the local (same-process) FetchAndOp: save old root
   values into leafupdate, then apply Op with leafdata onto rootdata. */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Logical ops.
   The operator in OP_LXOR should be empty but is ||. It is not used.
   Put here to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90"
*/
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops. The generated routines are named UnpackAndMax/Min etc. (on the pair type),
   but they are installed into the h_UnpackAndMaxloc/Minloc slots below. */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Integer types support insert/add/compare/logical/bitwise ops */
#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

/* Real types support insert/add/compare ops (no logical/bitwise) */
#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
/* Complex types support insert/add ops (no ordering, so no compare) */
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

/* "Dumb" (opaque) types only support insert */
#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types for MAXLOC/MINLOC: .u is the value, .i is the location (index) */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;} PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback for MPI implementations lacking MPI_Type_dup(): emulate it with a committed
   contiguous type of one element of the input datatype */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv.
 It also contains pack/unpack
   routines to pack/unpack root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even if we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
   need pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
       */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* Cache miss: create a brand new link and lay out its request array */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 (direct 0/1) request sets */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily init the device-side function table of the link when data is on device */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
#if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
#endif
#if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP)    {ierr = PetscSFLinkSetUp_Hip(sf,link,unit);CHKERRQ(ierr);}
#endif
#if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
#endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
 */
/* Return the MPI buffers and requests for the given communication direction.
   For persistent SFs, lazily creates the persistent MPI_Send_init/MPI_Recv_init
   requests per (direction, memtype, direct-flag) on first use. Any of the output
   pointers may be NULL if the caller does not need that item. */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Roots receive during a leaf-to-root reduction; skip the ndrootranks distinguished (local) ranks */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; /* byte offset of rank i's chunk in the remote root buffer */
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Leaves send during a leaf-to-root reduction */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}


/* Look up the in-use link matching (unit, rootdata, leafdata).
   With cmode=PETSC_OWN_POINTER the link is unlinked from the in-use list;
   with PETSC_USE_POINTER it is left in place. Errors if no link matches. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
}

/* Return a finished link to the 'avail' list so later communications can reuse it */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* 8 = #combinations of direction x memtype x direct-flag; assumption from the 3D reqs arrays above -- TODO confirm */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed.
 */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    /* Free host (and, if enabled, device) pack buffers for both local and remote scopes */
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
#endif
    }
#if defined(PETSC_HAVE_DEVICE)
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
    /* TODO: Make runtime */
#if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
#elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRHIP(cerr); link->stream = NULL;}
#endif
#endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
       the potential overlapping since this process does not participate in communication. Overlapping is harmless.
    */
    if (rootdata || leafdata) {
      for (p=&bas->inuse; (link=*p); p=&link->next) {
        ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
        if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
      }
    }
  }
  PetscFunctionReturn(0);
}

/* Host-to-host memcpy; the memtype arguments are unused since both buffers live on the host */
static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
{
  PetscFunctionBegin;
  if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}


/* Set up host pack/unpack kernels for the given MPI datatype: decompose 'unit'
   into a count of a supported basic type (char/int/PetscInt/PetscReal/complex/pairs)
   and install matching pack routines, link->bs, link->unitbytes and link->basicunit */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran
MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* Pick the largest block size (8/4/2/1) that divides the count; _1 suffix means bs==BS exactly */
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only needed when int differs from PetscInt */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unrecognized type: treat it as raw bytes ("dumb" type), which supports copy but not reductions */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}

/* Map (memtype, op, atomicity) to the link's matching UnpackAndOp routine;
   *UnpackAndOp is set to NULL when no match exists (caller falls back to MPI_Reduce_local) */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp =
link->h_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { /* d_ = device, non-atomic kernels */
    if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { /* da_ = device, atomic kernels (used when destination has duplicate indices) */
    if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Map (memtype, op, atomicity) to the link's matching ScatterAndOp routine;
   *ScatterAndOp is set to NULL when no match exists */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Return the FetchAndOp routine; only summation (MPI_SUM/MPIU_SUM) is supported */
PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

/* Return the local (root-to-leaf, in-process) FetchAndOp routine; only summation is supported */
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/* Log the flops of the reduction performed while unpacking root data (only counted for PetscScalar units) */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR)
{ /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* Log the flops of the reduction performed while unpacking leaf data (only counted for PetscScalar units) */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
  Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

  Output Arguments:
  .data    - The data to unpack to
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
    } else {
      /* Contiguous destination: one call covers all entries */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}

/* Scatter src entries (sparse or contiguous) into dst (sparse or contiguous) with MPI_Reduce_local,
   used when no native ScatterAndOp routine matches the requested MPI_Op */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      /* Source is contiguous: the unpack routine handles this case directly */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

  Notes:
  When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI can readily access (use_gpu_aware_mpi or not)
 */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* The fetched old values in rootbuf must be made visible to MPI for the return send */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    /* Same memtype on both sides: scatter roots to leaves directly, no intermediate buffer */
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1396 if (ScatterAndOp) { 1397 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1398 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1399 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1400 } else { 1401 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1402 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1403 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1404 } 1405 } 1406 PetscFunctionReturn(0); 1407 } 1408 1409 /* Fetch rootdata to leafdata and leafupdate locally */ 1410 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1411 { 1412 PetscErrorCode ierr; 1413 const PetscInt *rootindices = NULL,*leafindices = NULL; 1414 PetscInt count,rootstart,leafstart; 1415 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1416 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1417 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1418 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1419 1420 PetscFunctionBegin; 1421 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1422 if (rootmtype != leafmtype) { 1423 /* The local communication has to go through pack and unpack */ 1424 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing 
PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1425 } else { 1426 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1427 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1428 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1429 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1430 } 1431 PetscFunctionReturn(0); 1432 } 1433 1434 /* 1435 Create per-rank pack/unpack optimizations based on indice patterns 1436 1437 Input Parameters: 1438 + n - Number of destination ranks 1439 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1440 - idx - [*] Array storing indices 1441 1442 Output Parameters: 1443 + opt - Pack optimizations. NULL if no optimizations. 1444 */ 1445 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1446 { 1447 PetscErrorCode ierr; 1448 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1449 PetscBool optimizable = PETSC_TRUE; 1450 PetscSFPackOpt opt; 1451 1452 PetscFunctionBegin; 1453 ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1454 ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1455 opt->n = opt->array[0] = n; 1456 opt->offset = opt->array + 1; 1457 opt->start = opt->array + n + 2; 1458 opt->dx = opt->array + 2*n + 2; 1459 opt->dy = opt->array + 3*n + 2; 1460 opt->dz = opt->array + 4*n + 2; 1461 opt->X = opt->array + 5*n + 2; 1462 opt->Y = opt->array + 6*n + 2; 1463 1464 for (r=0; r<n; r++) { /* For each destination rank */ 1465 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. 
We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: length of the initial contiguous run */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* X is the row stride; only meaningful (readable from idx[p]) when there is more than one row */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* Clean break at a row boundary: the y-walk ended, try stacking planes */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    /* Y is the plane stride in rows; only readable from idx[p] when there is more than one plane */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* offset[] becomes a prefix sum of per-rank block volumes, used to address the packed buffer */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

/* Free a pack optimization plan and null the caller's pointer; no-op on NULL.
   NOTE(review): opt->array is released with PetscSFFree (memory-type aware) although
   PetscSFCreatePackOpt allocates it with PetscMalloc1 — presumably device mirrors of the
   plan are allocated elsewhere with PetscSFMalloc; confirm against the device code path. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

/* Analyze the SF's leaf/root index lists and precompute packing metadata:
   buffer lengths, contiguity flags (best case: plain memcpy), per-rank 3D pack plans
   (PetscSFCreatePackOpt) when not contiguous, and duplicate-index flags for device kernels. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* Local edges are those to the ndranks "distinguished" (on-node/self) ranks, which come first in roffset */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? Same analysis as above, on the root side */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (PetscDefined(HAVE_DEVICE)) {
    /* A multi-SF (sf->multi == sf) has duplicate-free indices by construction, so skip the check there */
    PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
    if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);}
    if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);}
    if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);}
    if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
  }
#endif

  PetscFunctionReturn(0);
}

/* Destroy all pack optimization plans (host and, when built with device support, device)
   created by PetscSFSetUpPackFields, for both local and remote scopes. */
PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}