#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#include <petsccublas.h>
#endif

/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);   } while (0) /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t));  } while (0) /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
/* Ref MPI MAXLOC */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u)         (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while (0)
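
/* For illustration only (not used by the library): CPPJoin4 pastes its four arguments together with
   underscores, so the macros below generate one routine per (operation, type, BS, EQ) combination.
   A hedged sketch of what the preprocessor produces:

     CPPJoin4(Pack,PetscInt,2,1)           ->  Pack_PetscInt_2_1
     CPPJoin4(UnpackAndAdd,PetscReal,8,0)  ->  UnpackAndAdd_PetscReal_8_0

   i.e., DEF_PackFunc(PetscInt,2,1) below defines a routine named Pack_PetscInt_2_1 that packs units
   of 2 PetscInts (bs == BS == 2, hence EQ = 1). */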

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
  +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
  .BS        Block size for vectorization. It is a factor of bs.
  -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
  +count     Number of indices in idx[].
  .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
  .opt       Per-pack optimization plan. NULL means no such plan.
  .idx       Indices of entries to be packed.
  .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
  .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
  -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);} /* indices are contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2  += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
  +Type      Type of the data
  .BS        Block size for vectorization
  -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p   += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }
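
/* A worked example of the per-pack optimization plan `opt` (a hedged reading of the code above,
   not an authoritative spec): each region r describes a dx[r] x dy[r] x dz[r] sub-block of a
   logically (X[r] x Y[r] x Z) array of entries, starting at entry opt->start[r]. With one region,
   say start=0, dx=2, dy=2, dz=1 embedded in X=4, Y=3, the rows copied are

     entries [0,2)   (k=0, j=0: offset X*Y*0 + X*0 = 0)
     entries [4,6)   (k=0, j=1: offset X*Y*0 + X*1 = 4)

   i.e., the pack gathers contiguous runs of dx entries with stride X between rows and stride X*Y
   between slabs, using memcpy on each run instead of per-entry indexing. */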

/* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op should not be Insert

   Arguments:
  +Opname    Name of the Op, such as Add, Mult, LAND, etc.
  .Type      Type of the data
  .BS        Block size for vectorization
  .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  -OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; \
        } \
    } \
    PetscFunctionReturn(0); \
  }

#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }
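
/* For illustration only, a hedged sketch of the fetch-and-op semantics defined above: with
   Opname=Add and Op=+, FetchAndAdd_<Type>_<BS>_<EQ>(link,count,start,opt,idx,u,p) behaves,
   entry by entry, like

     old       = u[idx[i]];    // value before the update
     u[idx[i]] = old + p[i];   // apply the reduction to the sparse array
     p[i]      = old;          // return the fetched (pre-update) value in the buffer

   so after the call the packed buffer p[] holds the old values of the updated entries. This is
   the building block behind PetscSFFetchAndOp-style communication. */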
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack             = CPPJoin4(Pack,Type,BS,EQ); \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax,Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin,Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }
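
/* For illustration only, a hedged sketch of what one instantiation expands to: DEF_Add(PetscReal,2,1)
   defines the static routines UnpackAndAdd_PetscReal_2_1, UnpackAndMult_PetscReal_2_1,
   FetchAndAdd_PetscReal_2_1, ScatterAndAdd_PetscReal_2_1, ScatterAndMult_PetscReal_2_1 and
   FetchAndAddLocal_PetscReal_2_1, plus an initializer that fills the link's host function table:

     PackInit_Add_PetscReal_2_1(link);
     // now link->h_UnpackAndAdd == UnpackAndAdd_PetscReal_2_1, etc.

   PetscSFLinkSetUp_Host() further below picks which PackInit_* family to call from the MPI datatype. */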
/* Logical ops.
   The operator passed for OP_LXOR should be empty, but we pass ||, which is never used. It is only
   there to avoid the compilation warning "empty macro arguments are undefined in ISO C90".
 */
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR,Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR,Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax,Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin,Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }
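
/* Why OP_LXOR normalizes with ! instead of using a bare operator (an illustrative note): C has no
   logical-XOR operator, and bitwise ^ on arbitrary nonzero "true" values would be wrong, e.g.

     1 ^ 2          // == 3: both operands are logically true, yet the result is nonzero (true)
     (!(1)) != (!(2)) // == 0, the correct MPI_LXOR result

   so OP_LXOR computes (s) = (!(s)) != (!(t)), collapsing both operands to 0/1 first. */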

#if defined(PETSC_HAVE_COMPLEX)
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u;      int i;}      PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)
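
/* How the pair types above implement MPI_MAXLOC/MPI_MINLOC (an illustrative sketch): a unit of
   MPI_2INT matches PairType(int,int) = struct {int u; int i;}, where u is the value and i its
   location. UnpackAndMax_int_int_1_1 then applies OP_XLOC with op '>', which reproduces the MPI
   MAXLOC semantics including the tie-break:

     s = {5, 3}, t = {5, 1}  ->  s = {5, 1}   // equal values: keep the smaller index
     s = {4, 0}, t = {7, 9}  ->  s = {7, 9}   // !(4 > 7), so s is replaced by t
 */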

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   This routine creates a communication link for the given operation. It first looks in its link cache. If
   there is a free & suitable one, it uses that. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines that pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
   those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   This routine also allocates buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   This routine is shared by SFBasic and SFNeighbor, based on the fact that both deal with sparse graphs and
   need to pack/unpack data.
 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests
         matches the current data. If not, free the old requests. New requests will be lazily init'ed when one calls
         PetscSFLinkGetMPIBuffersAndRequests().
       */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
  #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for buffering device data in case we do not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link. It may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
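
/* A hedged sketch (for illustration, not a verbatim excerpt) of how a caller such as an SF
   implementation strings the link routines together for one PetscSFBcast-like operation:

     PetscSFLink link;
     ierr = PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,PETSCSF_BCAST,&link);CHKERRQ(ierr);
     // ... pack, communicate, unpack (see the routines further below) ...
     ierr = PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
     ierr = PetscSFLinkReclaim(sf,&link);CHKERRQ(ierr); // return the link to bas->avail for reuse

   Create() moves a cached link from 'avail' to 'inuse' (or makes a new one); Reclaim() puts it back. */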

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf,void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi,leafmtype_mpi  = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
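
/* How the returned persistent requests are typically driven (a hedged sketch; MPI_Startall and
   MPI_Waitall are the standard MPI calls for requests created with MPI_Send_init/MPI_Recv_init):

     void        *rootbuf,*leafbuf;
     MPI_Request *rootreqs,*leafreqs;
     ierr = PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);CHKERRQ(ierr);
     ierr = MPI_Startall(nrootreqs,rootreqs);CHKERRQ(ierr); // start the pre-inited sends of rootbuf
     ierr = MPI_Startall(nleafreqs,leafreqs);CHKERRQ(ierr); // start the pre-inited recvs into leafbuf
     // ... later, MPI_Waitall on both sets before touching the buffers again ...

   Reusing the same requests across invocations is why the code above compares data pointers and
   frees/re-inits the requests when rootdata/leafdata change. */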

PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for a link in the 'inuse' list matching the type and the root/leafdata */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
#endif
    }
#if defined(PETSC_HAVE_DEVICE)
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
  #if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
  #elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
  #endif
#endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlapping since this process does not participate in communication. Overlapping is harmless.
   */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) {
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void *dst,PetscMemType srcmtype,const void *src,size_t n)
{
  PetscFunctionBegin;
  if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is an MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is an MPI builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is a PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}
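
/* A worked example of the dispatch above (illustrative): suppose unit is a contiguous type of
   6 MPIU_REALs. Then nPetscReal = 6: it is not 8, 6%8 != 0, not 4, 6%4 != 0, not 2, but 6%2 == 0,
   so PackInit_RealType_PetscReal_2_0(link) is chosen. The resulting routines run with bs = 6,
   BS = 2, EQ = 0, i.e., M = bs/BS = 3 blocks of 2 reals per unit, with the BS-loop vectorizable.
   Larger BS is tried first so each unit is handled with the fewest, widest blocks. */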

PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE &&  atomic) *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE &&  atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
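
/* A small worked example of the flop accounting above (illustrative): if unit is 4 MPIU_SCALARs
   (link->bs = 4) and bas->rootbuflen[PETSCSF_REMOTE] = 100, then unpacking with MPI_SUM applies one
   addition per scalar, so flops = 100*4 = 400 are logged. MPIU_REPLACE performs no arithmetic and
   non-scalar basic units are not counted. */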

/* When SF cannot find a proper UnpackAndOp() on the link, it falls back to MPI_Reduce_local.

   Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

   Output Arguments:
  .data    - The data to unpack to
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates
         element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
       */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx ? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
   Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides the root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

   Notes:
   When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, root data is
   in a place the underlying MPI can access (whether gpu-aware MPI is used or not).
 */
/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf

  Input Arguments:
+ sf       - The SF this packing works on
. link     - It gives the memtype of the roots and also provides the root buffer
. scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately
- rootdata - Where to read the roots

  Notes:
  When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root
  data is in a place the underlying MPI can access, whether GPU-aware MPI is in use or not.
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
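/* Roughly speaking, link->rootdirect[scope] (resp. link->leafdirect[scope]) is set when the
   indices for that scope are contiguous and the data already lives where MPI will read or
   write, so the user array itself can serve as the MPI buffer. For example (illustrative),
   a reduce whose remote leaves are entries 0..n-1 of a host array can hand leafdata to MPI
   directly, and the Pack calls above reduce to a few synchronization calls. */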
/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/*host2device*/);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else { /* No precompiled kernel for this op; fall back to MPI_Reduce_local() with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/*host2device*/);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else { /* No precompiled kernel for this op; fall back to MPI_Reduce_local() with host indices */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
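/* The pack/unpack routines above are composed into whole operations by the SF
   implementations. Schematically (a simplified sketch; the real callers also manage
   persistent MPI requests and streams), the remote part of a reduce looks like

     PetscSFLinkPackLeafData(sf,link,PETSCSF_REMOTE,leafdata);      -- leafdata -> leafbuf
     ...exchange leafbuf and rootbuf through MPI...
     PetscSFLinkUnpackRootData(sf,link,PETSCSF_REMOTE,rootdata,op); -- rootbuf -(op)-> rootdata

   while the PETSCSF_LOCAL part bypasses MPI via the *Local routines further below. */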
/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case: the local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else { /* Fall back to MPI_Reduce_local() with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
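/* In the common same-memtype branch above, ScatterAndOp moves data straight from rootdata to
   leafdata with no staging buffer. Schematically (illustrative), for op = MPIU_REPLACE on
   the host it amounts to

     for (i=0; i<count; i++) leafdata[leafindices[i]] = rootdata[rootindices[i]];

   applied to each of the bs basic types in a unit, with the contiguous and per-rank-optimized
   index cases specialized the same way as in the Pack kernels. */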
/* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case: the local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else { /* Fall back to MPI_Reduce_local() with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* No combined fetch-and-op kernel crosses the CPU-GPU boundary */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of the CPU-GPU boundary is not supported");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
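/* An example of the pattern PetscSFCreatePackOpt() below recognizes (illustrative): suppose
   the indices of one rank are

     idx[] = {100,101,102, 110,111,112, 130,131,132, 140,141,142}

   i.e., a dx*dy*dz = 3*2*2 box cut out of an array with row stride X = 10 and slab stride
   X*Y = 30, starting at index 100. The search below finds start=100, dx=3, dy=2, dz=2, X=10,
   Y=3, and the Pack/Unpack kernels can then copy the box with strided PetscArraycpy() calls
   instead of loading idx[] entry by entry. Such patterns arise naturally when exchanging
   ghost regions of structured grids. */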
/*
  Create per-rank pack/unpack optimizations based on index patterns

  Input Parameters:
+ n      - Number of destination ranks
. offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
- idx    - [*] Array storing the indices

  Output Parameters:
. out    - The pack optimizations. NULL if there are no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in the X dimension */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in the Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension;
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through the Z dimension to see if the remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free the arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] is for PETSCSF_LOCAL and [1] is for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are the leaf indices for self and remote contiguous? If yes, packing and unpacking reduce to plain memcpys */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are the root indices for self and remote contiguous? */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular
     instructions instead of atomics when they know there is no danger of data races */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
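/* Illustrative examples of the analysis in PetscSFSetUpPackFields(): with remote leaf indices
   rmine[] = {7,8,9,10}, we get leafstart[1] = 7 and leafcontig[1] = PETSC_TRUE, so packing the
   remote leaves is a single contiguous copy. With rmine[] = {7,9,11}, the indices are not
   contiguous; we then try PetscSFCreatePackOpt() and, failing that, fall back to the fully
   indexed pack/unpack loops (and, on devices, to atomic updates whenever the duplicate check
   above found repeated indices). */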