#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);} while(0)       /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t));} while(0)      /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)
/* Ref MPI MAXLOC */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u)         (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while(0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
  +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
  .BS        Block size for vectorization. It is a factor of bs.
  -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
  +count     Number of indices in idx[].
  .start     When opt and idx are NULL, indices are contiguous and start is the first index; otherwise not used.
  .opt       Per-pack optimization plan. NULL means no such plan.
  .idx       Indices of the entries to be packed.
  .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. E.g., if unit is MPI_2INT, then bs=2 and the basic type is int.
  .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
  -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);} /* idx[] is contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2  += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }
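/* For illustration only (a sketch, not generated code): instantiating DEF_PackFunc(PetscInt,2,1) above
   produces a routine named Pack_PetscInt_2_1() via CPPJoin4. With EQ=1 we get M=1 and MBS=2, so for the
   general indexed case the loops reduce to roughly

     for (i=0; i<count; i++)
       for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];

   which the compiler can unroll or vectorize because the trip count 2 is a compile-time constant. */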
/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts it into a sparse array.

   Arguments:
  +Type      Type of the data
  .BS        Block size for vectorization
  -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because here we want to use memcpy.
 */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p   += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op should not be Insert

   Arguments:
  +Opname    Name of the Op, such as Add, Mult, LAND, etc.
  .Type      Type of the data
  .BS        Block size for vectorization
  .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  -OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; \
        } \
    } \
    PetscFunctionReturn(0); \
  }

#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack             = CPPJoin4(Pack,            Type,BS,EQ); \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type,BS,EQ); \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }
/* Logical ops.
   The operator passed to OP_LXOR should be empty, but we pass || (it is not used) to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90".
 */
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||,OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||,OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* No need (though it is OK) to generate redundant functions when PetscInt is just int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get type names without spaces that CPPJoin4 can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u;      int i;     } PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)
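/* For illustration only: the pair structs above give MPI_MAXLOC/MPI_MINLOC semantics through OP_XLOC.
   E.g., the generated UnpackAndMax_int_int_1_1 applies OP_XLOC(>,s,t) to each {u,i} pair: if s.u == t.u it
   keeps the smaller index i, otherwise it keeps the pair with the larger value u, which is a sketch of what
   MPI_MAXLOC on an MPI_2INT computes. */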
/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
  This routine creates a communication link for the given operation. It first looks up its link cache. If
  there is a free & suitable one, it uses it. Otherwise it creates a new one.

  A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
  root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
  can be directly passed to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
  those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

  The routine also allocates buffers on the CPU when gpu-aware MPI is not used but the data is on the GPU.

  In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

  The routine is shared by SFBasic and SFNeighbor, based on the fact that both deal with sparse graphs and
  need to pack/unpack data.
 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype_mpi,leafmtype_mpi;  /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
         If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
       */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from the available list */
      goto found;
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) {     /* Two communication directions */
    for (j=0; j<2; j++) {   /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on the host for buffering device data in case we do not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set the `current` state of the link. It may change between SF invocations that reuse the same link */
  if (sf->persistent) { /* If the data is directly passed to MPI and initializes MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
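/* For illustration only: a rough sketch of how an SF implementation might drive a link over one
   communication round (the actual call sites live in the SFBasic/SFNeighbor code, not in this file):

     PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,PETSCSF_BCAST,&link);
     ... pack rootdata into link->rootbuf with link->h_Pack (or use rootdata directly if rootdirect) ...
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
     ... start and complete the MPI sends/recvs on those buffers and requests ...
     ... unpack/reduce leafbuf into leafdata with the routine from PetscSFLinkGetUnpackAndOp ...
     PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);  (e.g., in the End phase)
     PetscSFLinkReclaim(sf,&link);  (the link goes back to bas->avail for reuse)
*/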
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet done. Currently only SFBasic uses persistent MPI requests. */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
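/* For illustration only (a sketch; the actual start/wait logic lives in the SF implementation): once the
   persistent requests above are initialized, a caller would typically drive them with

     ierr = MPI_Startall(nreqs,reqs);CHKERRQ(ierr);                       // kick off the pre-posted sends/recvs
     ...
     ierr = MPI_Waitall(nreqs,reqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);

   which is why the requests are kept in the link and only re-initialized when the data they were bound to
   changes (see the cache lookup in PetscSFLinkCreate). */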
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for a matching link in the in-use list */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from the in-use list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
#if defined(PETSC_HAVE_CUDA)
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
#endif
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlap since this process does not participate in communication; overlapping is harmless then.
   */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) {
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}
#endif

PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is an MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is a PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
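/* For illustration only: if the caller's unit is a contiguous datatype made of 3 MPIU_REALs
   (e.g. built with MPI_Type_contiguous(3,MPIU_REAL,&unit)), then nPetscReal=3 above, none of the
   ==8/4/2/1 branches match, and the %1==0 fallback selects PackInit_RealType_PetscReal_1_0, giving
   bs=3, unitbytes=3*sizeof(PetscReal) and basicunit=MPIU_REAL. A unit of exactly 8 MPIU_REALs would
   instead select the fully specialized PackInit_RealType_PetscReal_8_1. */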
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}
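/* For illustration only: a caller that has just received into a leaf buffer would typically do something like

     PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,atomic,&UnpackAndOp);CHKERRQ(ierr);
     if (UnpackAndOp) {ierr = (*UnpackAndOp)(link,count,start,opt,idx,leafdata,leafbuf);CHKERRQ(ierr);}
     else             { ... fall back to PetscSFLinkUnpackDataWithMPIReduceLocal() further below ... }

   The convention of returning NULL for an unsupported op is what makes the MPI_Reduce_local fallback possible. */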
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              A set of helper routines for Pack/Unpack/Scatter on GPUs
 ============================================================================*/
#if defined(PETSC_HAVE_CUDA)
/* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
   make sure the data is ready for packing.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
    cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* The PetscSFLinkSyncStreamAfterPackXxxData routines make sure the root/leafbuf for the remote is ready for MPI */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Do nothing if we use stream-aware MPI || have nothing for the remote */
  if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
  /* If we called a packing kernel || we async-copied rootdata from device to host || no cudaDeviceSynchronize was called (since the default stream is assumed) */
  if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* See comments above */
  if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
  if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* The PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready for SF callers to use,
   when SF does not know which stream the callers will use.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Do nothing if we are expected to put rootdata on the default stream */
  if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
  /* If we have something from local, then we called a scatter kernel (on link->stream) and must sync it;
     if we have something from remote and we called an unpack kernel, then we must also sync it.
   */
  if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* See comments above */
  if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
  if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* The PetscSFLinkCopyXxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not using gpu-aware MPI, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
#else

#define PetscSFLinkSyncDeviceBeforePackData(a,b)                 0
#define PetscSFLinkSyncStreamAfterPackRootData(a,b)              0
#define PetscSFLinkSyncStreamAfterPackLeafData(a,b)              0
#define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)            0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)            0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c)  0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c)  0

#endif

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
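
/* Illustrative example for the flop logging above: if each unit holds two PetscScalars (link->bs == 2) and there are
   100 entries in the remote root buffer, a SUM unpack logs 100*2 = 200 flops (on the GPU when the roots live in
   device memory, otherwise on the CPU). */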
/* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.

  Input Arguments:
+ sf      - The StarForest
. link    - The link
. count   - Number of entries to unpack
. start   - The first index, significant when indices=NULL
. indices - Indices of entries in <data>. If NULL, the indices are contiguous and the first one is given in <start>
. buf     - A contiguous buffer to unpack from
- op      - Operation after unpack

  Output Arguments:
. data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
         element-wise on the basic units of a root/leaf; otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx ? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/
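
/* Illustrative call sequence (a sketch, not a definitive recipe): an SF implementation that uses the helpers below
   for a remote broadcast typically does

     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);CHKERRQ(ierr);       -- roots  -> root buffer
     .... post MPI sends/receives on the link's buffers and wait for them ....
     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_REMOTE,leafdata,op);CHKERRQ(ierr);  -- leaf buffer -> leaves

   while the self-to-self part goes through PetscSFLinkBcastAndOpLocal(). The MPI calls themselves are posted by the
   SF implementations that call these helpers (e.g., the basic SF), not in this file.
*/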
/* Pack rootdata to rootbuf

  Input Arguments:
+ sf       - The SF this packing works on.
. link     - It gives the memtype of the roots and also provides the root buffer.
. scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
- rootdata - Where to read the roots.

  Notes:
  When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data
  is in a place the underlying MPI can access (whether gpu-aware MPI is used or not).
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
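
/* Illustrative example for the else-branch above (an interpretation, not a specification): with rootdata and leafdata
   both on the host and op = MPI_SUM, the kernel returned by PetscSFLinkGetScatterAndOp() conceptually performs, for
   each local edge i and each basic unit within an entry,
     leafdata[leafindices[i]] += rootdata[rootindices[i]];
   using the contiguous start / pattern opt variants instead of the index arrays whenever those are available. */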
/* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* Rootdata and leafdata in different memory types (one on host, one on device) is not supported here */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
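
/* Worked example of the index pattern recognized by PetscSFCreatePackOpt() below (illustrative): suppose one
   destination rank is associated with
     idx[] = {100,101,102, 110,111,112, 120,121,122},
   i.e., a 3x3 sub-block of a row-major array whose row length is 10. The x-walk finds dx=3 starting at start=100,
   the stride to the next row gives X=110-100=10, the y-walk finds dy=3, and dz=1 with Y=dy=3. The rank's indices are
   then fully described by (start,dx,dy,dz,X,Y), so the pack/unpack kernels can move whole x-rows at a time instead
   of gathering entry by entry. */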
/*
  Create per-rank pack/unpack optimizations based on index patterns

  Input Parameters:
+ n      - Number of destination ranks
. offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
- idx    - [*] Array storing indices

  Output Parameters:
. out    - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension;
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
    else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr); opt->array = NULL;}
#endif
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}
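
/* PetscSFSetUpPackFields() below analyzes the leaf indices (sf->rmine) and root indices (bas->irootloc) once at setup
   time. Illustrative example: remote leaf indices {5,6,7,8} are contiguous, so leafcontig[1] stays PETSC_TRUE with
   leafstart[1]=5; indices like {5,7,9,11} are not, so PetscSFCreatePackOpt() is tried on them, and with CUDA we also
   record whether duplicate indices exist so that unpack kernels can avoid atomic operations when there is no risk of
   a data race. */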
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_CUDA)
  /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of a data race */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,  &sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,  &bas->rootpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}