#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#endif

/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);   } while (0)    /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t));  } while (0)    /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
/* Ref MPI MAXLOC */
#define OP_XLOC(op,s,t) \
  do {                                                        \
    if ((s).u == (t).u)         (s).i = PetscMin((s).i,(t).i);\
    else if (!((s).u op (t).u)) s = t;                        \
  } while (0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  {                                                                       \
    PetscErrorCode ierr;                                                  \
    const Type     *u = (const Type*)unpacked,*u2;                        \
    Type           *p = (Type*)packed,*p2;                                \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                               \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin;                                                   \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */                     \
      p2 = p;                                                             \
      for (r=0; r<opt->n; r++) {                                          \
        u2 = u + opt->start[r]*MBS;                                       \
        X  = opt->X[r];                                                   \
        Y  = opt->Y[r];                                                   \
        for (k=0; k<opt->dz[r]; k++)                                      \
          for (j=0; j<opt->dy[r]; j++) {                                  \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2 += opt->dx[r]*MBS;                                         \
          }                                                               \
      }                                                                   \
    } else {                                                              \
      for (i=0; i<count; i++)                                             \
        for (j=0; j<M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                       \
    }                                                                     \
    PetscFunctionReturn(0);                                               \
  }
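/* For illustration only (not compiled): with Type=PetscInt, BS=2, EQ=1 the macro above generates a routine
   roughly equivalent to the sketch below, where MBS collapses to the compile-time constant 2 and the inner
   loop becomes a candidate for vectorization (error handling and the opt branch are omitted in this sketch):

     static PetscErrorCode Pack_PetscInt_2_1(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed)
     {
       const PetscInt *u = (const PetscInt*)unpacked;
       PetscInt       *p = (PetscInt*)packed,i,k;
       if (!idx) return PetscArraycpy(p,u+start*2,2*count);   [contiguous indices]
       for (i=0; i<count; i++)
         for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];        [sparse indices]
       return 0;
     }
 */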
/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  {                                                                       \
    PetscErrorCode ierr;                                                  \
    Type           *u = (Type*)unpacked,*u2;                              \
    const Type     *p = (const Type*)packed;                              \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                               \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin;                                                   \
    if (!idx) {                                                           \
      u += start*MBS;                                                     \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}    \
    } else if (opt) { /* has optimizations available */                   \
      for (r=0; r<opt->n; r++) {                                          \
        u2 = u + opt->start[r]*MBS;                                       \
        X  = opt->X[r];                                                   \
        Y  = opt->Y[r];                                                   \
        for (k=0; k<opt->dz[r]; k++)                                      \
          for (j=0; j<opt->dy[r]; j++) {                                  \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS;                                          \
          }                                                               \
      }                                                                   \
    } else {                                                              \
      for (i=0; i<count; i++)                                             \
        for (j=0; j<M; j++)                                               \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];    \
    }                                                                     \
    PetscFunctionReturn(0);                                               \
  }

/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname    Name of the Op, such as Add, Mult, LAND, etc.
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   .OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  {                                                                       \
    Type           *u = (Type*)unpacked,*u2;                              \
    const Type     *p = (const Type*)packed;                              \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                               \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1.
*/ \ 135 PetscFunctionBegin; \ 136 if (!idx) { \ 137 u += start*MBS; \ 138 for (i=0; i<count; i++) \ 139 for (j=0; j<M; j++) \ 140 for (k=0; k<BS; k++) \ 141 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 142 } else if (opt) { /* idx[] has patterns */ \ 143 for (r=0; r<opt->n; r++) { \ 144 u2 = u + opt->start[r]*MBS; \ 145 X = opt->X[r]; \ 146 Y = opt->Y[r]; \ 147 for (k=0; k<opt->dz[r]; k++) \ 148 for (j=0; j<opt->dy[r]; j++) { \ 149 for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 150 p += opt->dx[r]*MBS; \ 151 } \ 152 } \ 153 } else { \ 154 for (i=0; i<count; i++) \ 155 for (j=0; j<M; j++) \ 156 for (k=0; k<BS; k++) \ 157 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 158 } \ 159 PetscFunctionReturn(0); \ 160 } 161 162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 163 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 164 { \ 165 Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 166 PetscInt i,j,k,r,l,bs=link->bs; \ 167 const PetscInt M = (EQ) ? 1 : bs/BS; \ 168 const PetscInt MBS = M*BS; \ 169 PetscFunctionBegin; \ 170 for (i=0; i<count; i++) { \ 171 r = (!idx ? start+i : idx[i])*MBS; \ 172 l = i*MBS; \ 173 for (j=0; j<M; j++) \ 174 for (k=0; k<BS; k++) { \ 175 tmp = u[r+j*BS+k]; \ 176 OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 177 p[l+j*BS+k] = tmp; \ 178 } \ 179 } \ 180 PetscFunctionReturn(0); \ 181 } 182 183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 184 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 185 { \ 186 PetscErrorCode ierr; \ 187 const Type *u = (const Type*)src; \ 188 Type *v = (Type*)dst; \ 189 PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 190 const PetscInt M = (EQ) ? 1 : bs/BS; \ 191 const PetscInt MBS = M*BS; \ 192 PetscFunctionBegin; \ 193 if (!srcIdx) { /* src is contiguous */ \ 194 u += srcStart*MBS; \ 195 ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 196 } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 197 u += srcOpt->start[0]*MBS; \ 198 v += dstStart*MBS; \ 199 X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 200 for (k=0; k<srcOpt->dz[0]; k++) \ 201 for (j=0; j<srcOpt->dy[0]; j++) { \ 202 for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 203 v += srcOpt->dx[0]*MBS; \ 204 } \ 205 } else { /* all other cases */ \ 206 for (i=0; i<count; i++) { \ 207 s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 208 t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 209 for (j=0; j<M; j++) \ 210 for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 211 } \ 212 } \ 213 PetscFunctionReturn(0); \ 214 } 215 216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 217 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 218 { \ 219 Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 220 const Type *ldata = (const Type*)leafdata; \ 221 PetscInt i,j,k,r,l,bs = link->bs; \ 222 const PetscInt M = (EQ) ? 
1 : bs/BS; \ 223 const PetscInt MBS = M*BS; \ 224 PetscFunctionBegin; \ 225 for (i=0; i<count; i++) { \ 226 r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 227 l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 228 for (j=0; j<M; j++) \ 229 for (k=0; k<BS; k++) { \ 230 lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 231 OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 232 } \ 233 } \ 234 PetscFunctionReturn(0); \ 235 } 236 237 /* Pack, Unpack/Fetch ops */ 238 #define DEF_Pack(Type,BS,EQ) \ 239 DEF_PackFunc(Type,BS,EQ) \ 240 DEF_UnpackFunc(Type,BS,EQ) \ 241 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 242 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 243 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 244 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 245 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 246 } 247 248 /* Add, Mult ops */ 249 #define DEF_Add(Type,BS,EQ) \ 250 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 251 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 252 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 253 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 254 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 255 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 256 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 257 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 258 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 259 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 260 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 261 link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 262 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 263 } 264 265 /* Max, Min ops */ 266 #define DEF_Cmp(Type,BS,EQ) \ 267 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 268 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 269 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 270 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 271 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 272 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 273 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 274 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 275 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 276 } 277 278 /* Logical ops. 279 The operator in OP_LXOR should be empty but is ||. It is not used. 
Put here to avoid 280 the compilation warning "empty macro arguments are undefined in ISO C90" 281 */ 282 #define DEF_Log(Type,BS,EQ) \ 283 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 284 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 285 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 286 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 287 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 288 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 289 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 290 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 291 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 292 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 293 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 294 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 295 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 296 } 297 298 /* Bitwise ops */ 299 #define DEF_Bit(Type,BS,EQ) \ 300 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 301 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 302 DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 303 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 304 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 305 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 306 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 307 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 308 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 309 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 310 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 311 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 312 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 313 } 314 315 /* Maxloc, Minloc ops */ 316 #define DEF_Xloc(Type,BS,EQ) \ 317 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 318 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 319 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 320 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 321 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 322 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 323 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 324 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 325 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 326 } 327 328 #define DEF_IntegerType(Type,BS,EQ) \ 329 DEF_Pack(Type,BS,EQ) \ 330 DEF_Add(Type,BS,EQ) \ 331 DEF_Cmp(Type,BS,EQ) \ 332 DEF_Log(Type,BS,EQ) \ 333 DEF_Bit(Type,BS,EQ) \ 334 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 335 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 336 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 337 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 338 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 339 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 340 } 341 342 #define DEF_RealType(Type,BS,EQ) \ 343 DEF_Pack(Type,BS,EQ) \ 344 DEF_Add(Type,BS,EQ) \ 345 DEF_Cmp(Type,BS,EQ) \ 346 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 347 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 348 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 349 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 350 } 351 352 #if defined(PETSC_HAVE_COMPLEX) 353 #define DEF_ComplexType(Type,BS,EQ) \ 354 DEF_Pack(Type,BS,EQ) \ 355 DEF_Add(Type,BS,EQ) \ 356 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 357 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 358 
CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 359 } 360 #endif 361 362 #define DEF_DumbType(Type,BS,EQ) \ 363 DEF_Pack(Type,BS,EQ) \ 364 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 365 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 366 } 367 368 /* Maxloc, Minloc */ 369 #define DEF_PairType(Type,BS,EQ) \ 370 DEF_Pack(Type,BS,EQ) \ 371 DEF_Xloc(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 375 } 376 377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 385 386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 387 DEF_IntegerType(int,1,1) 388 DEF_IntegerType(int,2,1) 389 DEF_IntegerType(int,4,1) 390 DEF_IntegerType(int,8,1) 391 DEF_IntegerType(int,1,0) 392 DEF_IntegerType(int,2,0) 393 DEF_IntegerType(int,4,0) 394 DEF_IntegerType(int,8,0) 395 #endif 396 397 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 398 typedef signed char SignedChar; 399 DEF_IntegerType(SignedChar,1,1) 400 DEF_IntegerType(SignedChar,2,1) 401 DEF_IntegerType(SignedChar,4,1) 402 DEF_IntegerType(SignedChar,8,1) 403 DEF_IntegerType(SignedChar,1,0) 404 DEF_IntegerType(SignedChar,2,0) 405 DEF_IntegerType(SignedChar,4,0) 406 DEF_IntegerType(SignedChar,8,0) 407 408 typedef unsigned char UnsignedChar; 409 DEF_IntegerType(UnsignedChar,1,1) 410 DEF_IntegerType(UnsignedChar,2,1) 411 DEF_IntegerType(UnsignedChar,4,1) 412 DEF_IntegerType(UnsignedChar,8,1) 413 DEF_IntegerType(UnsignedChar,1,0) 414 DEF_IntegerType(UnsignedChar,2,0) 415 DEF_IntegerType(UnsignedChar,4,0) 416 DEF_IntegerType(UnsignedChar,8,0) 417 418 DEF_RealType(PetscReal,1,1) 419 DEF_RealType(PetscReal,2,1) 420 DEF_RealType(PetscReal,4,1) 421 DEF_RealType(PetscReal,8,1) 422 DEF_RealType(PetscReal,1,0) 423 DEF_RealType(PetscReal,2,0) 424 DEF_RealType(PetscReal,4,0) 425 DEF_RealType(PetscReal,8,0) 426 427 #if defined(PETSC_HAVE_COMPLEX) 428 DEF_ComplexType(PetscComplex,1,1) 429 DEF_ComplexType(PetscComplex,2,1) 430 DEF_ComplexType(PetscComplex,4,1) 431 DEF_ComplexType(PetscComplex,8,1) 432 DEF_ComplexType(PetscComplex,1,0) 433 DEF_ComplexType(PetscComplex,2,0) 434 DEF_ComplexType(PetscComplex,4,0) 435 DEF_ComplexType(PetscComplex,8,0) 436 #endif 437 438 #define PairType(Type1,Type2) Type1##_##Type2 439 typedef struct {int u; int i;} PairType(int,int); 440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 441 DEF_PairType(PairType(int,int),1,1) 442 DEF_PairType(PairType(PetscInt,PetscInt),1,1) 443 444 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 445 DEF_DumbType(char,1,1) 446 DEF_DumbType(char,2,1) 447 DEF_DumbType(char,4,1) 448 DEF_DumbType(char,1,0) 449 DEF_DumbType(char,2,0) 450 DEF_DumbType(char,4,0) 451 452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. 
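   (For example, CPPJoin4(Pack,DumbInt,8,1) yields Pack_DumbInt_8_1, which avoids a clash with the Pack_int_8_1
   that DEF_IntegerType(int,8,1) may generate above when PETSC_USE_64BIT_INDICES is defined.)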
 */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   This routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor because both deal with sparse graphs and need to
   pack/unpack data.
 */
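/* A rough usage sketch for orientation only; the real call sites live in the SFBasic/SFNeighbor
   implementations and local variable names below are illustrative. A broadcast would do roughly:

     PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,MPIU_REPLACE,PETSCSF_BCAST,&link);
     (pack rootdata into link->rootbuf with link->h_Pack or its device counterpart)
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
     (start and wait on the requests, e.g. MPI_Startall/MPI_Waitall for persistent requests)
     (unpack link->leafbuf into leafdata with link->h_UnpackAndInsert etc.)
     PetscSFLinkReclaim(sf,&link);
 */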
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
         If not, free the old requests. New requests will be lazily initialized when one calls PetscSFLinkGetMPIBuffersAndRequests().
       */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) {     /* Two communication directions */
    for (j=0; j<2; j++) {   /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering device data in case gpu-aware MPI is not used */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set the `current` state of the link. These fields may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and initializes MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not done yet.
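     Persistent requests are created once below with MPI_Recv_init/MPI_Send_init and then started repeatedly
     (with MPI_Start/MPI_Startall) by the caller, which avoids rebuilding the requests on every communication.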
Currently only SFBasic uses persistent MPI */ 657 if (sf->persistent) { 658 if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 659 ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 660 if (direction == PETSCSF_LEAF2ROOT) { 661 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 662 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 663 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 664 ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 665 } 666 } else { /* PETSCSF_ROOT2LEAF */ 667 for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 668 disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 669 ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 670 ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 671 } 672 } 673 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 674 } 675 676 if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 677 ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 678 if (direction == PETSCSF_LEAF2ROOT) { 679 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 680 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 681 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 682 ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 683 } 684 } else { /* PETSCSF_ROOT2LEAF */ 685 for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 686 disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 687 ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 688 ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 689 } 690 } 691 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 692 } 693 } 694 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 695 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 696 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 697 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 698 PetscFunctionReturn(0); 699 } 700 701 702 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 703 { 704 PetscErrorCode ierr; 705 PetscSFLink link,*p; 706 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 707 708 PetscFunctionBegin; 709 /* Look for types in cache */ 710 for (p=&bas->inuse; (link=*p); p=&link->next) { 711 PetscBool match; 712 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 713 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 714 switch (cmode) { 715 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 716 case PETSC_USE_POINTER: break; 717 default: 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 718 } 719 *mylink = link; 720 PetscFunctionReturn(0); 721 } 722 } 723 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 724 PetscFunctionReturn(0); 725 } 726 727 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link) 728 { 729 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 730 731 PetscFunctionBegin; 732 (*link)->rootdata = NULL; 733 (*link)->leafdata = NULL; 734 (*link)->next = bas->avail; 735 bas->avail = *link; 736 *link = NULL; 737 PetscFunctionReturn(0); 738 } 739 740 /* Destroy all links chained in 'avail' */ 741 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail) 742 { 743 PetscErrorCode ierr; 744 PetscSFLink link = *avail,next; 745 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 746 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 747 748 PetscFunctionBegin; 749 for (; link; link=next) { 750 next = link->next; 751 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 752 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 753 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 754 } 755 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 756 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 757 #if defined(PETSC_HAVE_CUDA) 758 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 759 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 760 if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 761 #endif 762 ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 763 ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 764 } 765 ierr = PetscFree(link);CHKERRQ(ierr); 766 } 767 *avail = NULL; 768 PetscFunctionReturn(0); 769 } 770 771 /* Error out on unsupported overlapped communications */ 772 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 773 { 774 PetscErrorCode ierr; 775 PetscSFLink link,*p; 776 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 777 PetscBool match; 778 779 PetscFunctionBegin; 780 if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0); 781 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 782 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 783 */ 784 if (rootdata || leafdata) { 785 for (p=&bas->inuse; (link=*p); p=&link->next) { 786 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 787 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. 
Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops.
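                              (The pair-type path provides MAXLOC/MINLOC but no arithmetic reductions, while the
                              two-PetscInt integer path provides SUM, MAX/MIN, logical and bitwise ops instead.)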
*/ 828 PackInit_PairType_PetscInt_PetscInt_1_1(link); 829 link->bs = 1; 830 link->unitbytes = 2*sizeof(PetscInt); 831 link->basicunit = MPIU_2INT; 832 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 833 link->unit = MPIU_2INT; 834 } else if (nPetscReal) { 835 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 836 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 837 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 838 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 839 link->bs = nPetscReal; 840 link->unitbytes = nPetscReal*sizeof(PetscReal); 841 link->basicunit = MPIU_REAL; 842 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 843 } else if (nPetscInt) { 844 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 845 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 846 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 847 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 848 link->bs = nPetscInt; 849 link->unitbytes = nPetscInt*sizeof(PetscInt); 850 link->basicunit = MPIU_INT; 851 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 852 #if defined(PETSC_USE_64BIT_INDICES) 853 } else if (nInt) { 854 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 855 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 856 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 857 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 858 link->bs = nInt; 859 link->unitbytes = nInt*sizeof(int); 860 link->basicunit = MPI_INT; 861 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 862 #endif 863 } else if (nSignedChar) { 864 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 865 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 866 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 867 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 868 link->bs = nSignedChar; 869 link->unitbytes = nSignedChar*sizeof(SignedChar); 870 link->basicunit = MPI_SIGNED_CHAR; 871 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 872 } else if (nUnsignedChar) { 873 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 874 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if 
(nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 875 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 876 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 877 link->bs = nUnsignedChar; 878 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 879 link->basicunit = MPI_UNSIGNED_CHAR; 880 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 881 #if defined(PETSC_HAVE_COMPLEX) 882 } else if (nPetscComplex) { 883 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 884 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 885 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 886 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 887 link->bs = nPetscComplex; 888 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 889 link->basicunit = MPIU_COMPLEX; 890 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 891 #endif 892 } else { 893 MPI_Aint lb,nbyte; 894 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 895 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 896 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 897 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 898 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 899 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 900 link->bs = nbyte; 901 link->unitbytes = nbyte; 902 link->basicunit = MPI_BYTE; 903 } else { 904 nInt = nbyte / sizeof(int); 905 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 906 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 907 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 908 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 909 link->bs = nInt; 910 link->unitbytes = nbyte; 911 link->basicunit = MPI_INT; 912 } 913 if (link->isbuiltin) link->unit = unit; 914 } 915 916 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 917 PetscFunctionReturn(0); 918 } 919 920 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 921 { 922 PetscFunctionBegin; 923 *UnpackAndOp = NULL; 924 if (mtype == PETSC_MEMTYPE_HOST) { 925 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 926 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 927 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 928 else if (op == MPI_MAX || op == 
MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 929 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 930 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 931 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 932 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 933 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 934 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 935 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 936 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 937 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 938 } 939 #if defined(PETSC_HAVE_CUDA) 940 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 941 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 942 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 943 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 944 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 945 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 946 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 947 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 948 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 949 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 950 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 951 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 952 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 953 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 954 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 955 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 956 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 957 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 958 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 959 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 960 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 961 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 962 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 963 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 964 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 965 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 966 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 967 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 968 } 969 #endif 970 PetscFunctionReturn(0); 971 } 972 973 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 974 { 975 PetscFunctionBegin; 976 *ScatterAndOp = NULL; 977 if (mtype == PETSC_MEMTYPE_HOST) { 978 if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 979 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 980 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 981 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 982 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 983 else if (op == MPI_LAND) *ScatterAndOp = 
link->h_ScatterAndLAND; 984 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 985 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 986 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 987 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 988 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 989 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 990 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 991 } 992 #if defined(PETSC_HAVE_CUDA) 993 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 994 if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 995 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 996 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 997 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 998 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 999 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1000 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1001 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1002 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1003 else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 1004 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1005 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1006 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1007 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1008 if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1009 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1010 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1011 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1012 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1013 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1014 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1015 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1016 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1017 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1018 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1019 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1020 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 1021 } 1022 #endif 1023 PetscFunctionReturn(0); 1024 } 1025 1026 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 1027 { 1028 PetscFunctionBegin; 1029 *FetchAndOp = NULL; 1030 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1031 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1032 #if defined(PETSC_HAVE_CUDA) 1033 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1034 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd; 1035 #endif 1036 PetscFunctionReturn(0); 1037 } 1038 1039 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode 
(**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 1040 { 1041 PetscFunctionBegin; 1042 *FetchAndOpLocal = NULL; 1043 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1044 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1045 #if defined(PETSC_HAVE_CUDA) 1046 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1047 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1048 #endif 1049 PetscFunctionReturn(0); 1050 } 1051 1052 /*============================================================================= 1053 A set of helper routines for Pack/Unpack/Scatter on GPUs 1054 ============================================================================*/ 1055 #if defined(PETSC_HAVE_CUDA) 1056 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 1057 make sure the data is ready for packing. 1058 */ 1059 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 1060 { 1061 PetscFunctionBegin; 1062 if (sf->use_default_stream) PetscFunctionReturn(0); 1063 if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) { 1064 cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr); 1065 } 1066 PetscFunctionReturn(0); 1067 } 1068 1069 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 1070 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 1071 { 1072 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1073 1074 PetscFunctionBegin; 1075 /* Do nothing if we use stream aware mpi || has nothing for remote */ 1076 if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1077 /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 1078 if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) { 1079 cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1080 } 1081 PetscFunctionReturn(0); 1082 } 1083 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 1084 { 1085 PetscFunctionBegin; 1086 /* See comments above */ 1087 if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1088 if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) { 1089 cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1090 } 1091 PetscFunctionReturn(0); 1092 } 1093 1094 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 1095 does not know which stream the callers will use. 1096 */ 1097 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 1098 { 1099 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1100 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? 
PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
  if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);

  /* If rootmtype is HOST or DEVICE:
     If we have data from local, then we called a scatter kernel (on link->stream), so we must sync it;
     If we have data from remote && no rootdirect (i.e., we called an unpack kernel), then we must also sync it (if rootdirect,
     i.e., no unpack kernel after MPI, MPI guarantees rootbuf is ready to use so that we do not need the sync).

     Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on a uni-processor: we must sync the stream, otherwise the
     CPU thread might use the yet-to-be-updated rootdata pending in the stream.
   */
  if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData */
  if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
  if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
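   Note the copies below are issued asynchronously on link->stream with cudaMemcpyAsync; the
   PetscSFLinkSyncStreamAfterPackXxxData routines above synchronize that stream so the host buffer is
   complete before it is handed to MPI.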
/* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
#else

#define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
#define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
#define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
#define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0

#endif

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.
  Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

  Output Arguments:
  .data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wise. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx ? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
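/* To make the unit-vs-basicunit distinction above concrete, here is a small sketch (standalone, with made-up values,
   not code from this file) of what the indexed branch effectively does when the user's unit is MPI_2INT and the op
   is MPI_MAXLOC:

     int buf[2]  = {7,3};        // one packed entry: {value, location}
     int data[4] = {5,9, 1,2};   // two sparse entries of the same unit
     int idx[1]  = {0};          // unpack into entry 0 of data
     // reduce one *whole* unit at a time, so MAXLOC sees {value,location} pairs, not individual ints
     MPI_Reduce_local(buf,&data[idx[0]*2],1,MPI_2INT,MPI_MAXLOC);
     // data[0..1] is now {7,3}, because 7 > 5

   For an op such as MPI_SUM the same call would also be correct, but then the op happens to act on each basic int
   independently, which is why the comment above says it works element-wise in that case.
*/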
/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides the root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

  Notes:
  When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, the root data is
  in a place the underlying MPI can access (whether use_gpu_aware_mpi or not).
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
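/* For orientation, a hedged sketch of how a Bcast-like operation would typically chain the routines in this section;
   the StartCommunication/FinishCommunication calls are placeholders for the MPI Isend/Irecv machinery elsewhere in SF,
   not functions defined in this file:

     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);CHKERRQ(ierr);     // rootdata -> rootbuf (maybe a no-op)
     // StartCommunication(sf,link);                                                    // post sends/recvs on rootbuf/leafbuf
     ierr = PetscSFLinkBcastAndOpLocal(sf,link,rootdata,leafdata,op);CHKERRQ(ierr);     // overlap: handle local edges meanwhile
     // FinishCommunication(sf,link);                                                   // wait for remote buffers
     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_REMOTE,leafdata,op);CHKERRQ(ierr); // leafbuf -> leafdata with op

   The PETSCSF_LOCAL/PETSCSF_REMOTE split is what allows the local scatter to overlap with the network traffic.
*/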
/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL,*leafindices = NULL;
  PetscInt           count,rootstart,leafstart;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
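/* A short walk-through of the "Uncommon case" above, purely as an illustration of the data path when rootdata lives on
   the host and leafdata on the device (the memory types could equally be swapped):

     rootdata (host)
       --PetscSFLinkPackRootData(...,PETSCSF_LOCAL,...)-->    link->rootbuf[PETSCSF_LOCAL][PETSC_MEMTYPE_HOST]
       --PetscSFLinkMemcpy(host -> device)-->                 link->leafbuf[PETSCSF_LOCAL][PETSC_MEMTYPE_DEVICE]
       --PetscSFLinkUnpackLeafData(...,PETSCSF_LOCAL,op)-->   leafdata (device)

   In the common case (both sides on the same memory type) the intermediate buffers are skipped and a single
   ScatterAndOp kernel, or the MPI_Reduce_local fallback, moves the data root-to-leaf in one pass.
*/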
/* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL,*leafindices = NULL;
  PetscInt           count,rootstart,leafstart;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
/* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL,*leafindices = NULL;
  PetscInt           count,rootstart,leafstart;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication would have to go through pack and unpack, which is not supported here */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
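/* The fetch-and-op semantics realized by the FetchAndOpLocal kernels are, for each local edge (root r, leaf l):
   leafupdate[l] receives the old rootdata[r], and rootdata[r] becomes op(rootdata[r], leafdata[l]). A scalar sketch
   for op = MPI_SUM (illustrative only; the real kernels work on indexed, blocked units and may need atomics when
   bas->rootdups[] indicates that several leaves hit the same root):

     PetscScalar old = rootdata[r];       // fetch the current root value
     rootdata[r]     = old + leafdata[l]; // then apply the reduction
     leafupdate[l]   = old;               // caller sees the pre-update value, as in an atomic fetch-and-add
*/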
/*
  Create per-rank pack/unpack optimizations based on index patterns

  Input Parameters:
  + n      - Number of destination ranks
  . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
  - idx    - [*] Array storing indices

  Output Parameters:
  + out    - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension;
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
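/* A small worked example of the pattern detection above (indices chosen purely for illustration). Suppose one rank is
   associated with the 12 indices

     idx = {5,6,7, 9,10,11, 21,22,23, 25,26,27}

   The x-walk finds dx=3 (5,6,7 are consecutive), the jump to 9 gives the row stride X=4, the y-walk finds dy=2 rows,
   the jump to 21 gives the plane stride X*Y=16 so Y=4, and the z-walk confirms dz=2 planes. The routine records
   {start=5, dx=3, dy=2, dz=2, X=4, Y=4}, i.e., a 3x2x2 sub-block of a 4x4xZ array starting at index 5, so Pack/Unpack
   can run dz*dy strided copies of dx entries each instead of a fully indexed gather.
*/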
PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
    else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array = NULL;}
#endif
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_CUDA)
  /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
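/* For illustration of what PetscSFSetUpPackFields() computes (made-up numbers): if a rank's local leaves are
   rmine = {4,5,6,7}, the contiguity loop keeps leafcontig[0] = PETSC_TRUE with leafstart[0] = 4, so packing that
   scope is a single PetscArraycpy starting at entry 4 and no index array is ever dereferenced. If instead
   rmine = {4,6,5,7}, leafcontig[0] becomes PETSC_FALSE, PetscSFCreatePackOpt() is asked to look for a strided
   dx*dy*dz pattern, and (with CUDA) PetscCheckDupsInt() decides whether unpack kernels can skip atomics because
   every leaf index is unique.
*/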