#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Paste four tokens together with '_' separators, e.g., CPPJoin4(Pack,int,1,1) -> Pack_int_1_1 */
#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)    /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)    /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)             /* plain insertion, i.e., MPI_REPLACE semantics */
/* Ref MPI MAXLOC: on a tie in the value field u, keep the smaller index i; otherwise keep the (op)-winning pair */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u)         (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while(0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bsz.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);} /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available: opt describes runs of 3D (dx,dy,dz) subblocks with strides X,Y */ \
      p2 = p; \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p2 += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)   /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts into a sparse array.

   Arguments:
   +Type  Type of the data
   .BS    Block size for vectorization
   -EQ    (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip the copy when in-place */ \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
   +Opname  Name of the Op, such as Add, Mult, LAND, etc.
   .Type    Type of the data
   .BS      Block size for vectorization
   .EQ      (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op      Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOp - like UnpackAndOp, but additionally returns the old (pre-op) values of the
   unpacked entries in the packed buffer, i.e., the fetch-and-op primitive used by PetscSFFetchAndOp */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs=link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; /* return the old value through packed */ \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_ScatterAndOp - scatter entries from a (possibly sparse) src directly into a (possibly sparse) dst,
   applying Op at the destination. Fast paths: contiguous src reuses UnpackAndOp; a single-run 3D src with
   contiguous dst is handled inline. */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - fetch-and-op between local rootdata and leafdata: leafupdate receives the
   old root values, then Op(rootdata, leafdata) is applied in place at the roots */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,            Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp  (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp  (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add,+,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Logical ops.
   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90"
 */
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops. Note these reuse the UnpackAndMax/Min names (via OP_XLOC on pair types),
   so they cannot be mixed with DEF_Cmp on the same Type. */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Full op set for integer types: pack + add/mult + max/min + logical + bitwise */
#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

/* Real types: no logical/bitwise ops */
#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
/* Complex types: no comparison ops either (complex numbers are unordered) */
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

/* Opaque byte streams: only pack/insert is meaningful */
#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types for MAXLOC/MINLOC: u is the value, i is the index */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;} PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback MPI_Type_dup for MPI implementations lacking it: a committed contiguous type of count 1 */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
   need pack/unpack data.
 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata as the buffers with the given sf, sfop and op?
     Direct use requires contiguous indices; on the unpack side it additionally requires MPIU_REPLACE,
     since any other op must read the incoming buffer and combine it with existing data. */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
       */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* No cached link matched: build a new one */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 mtypes x 2 direct-flags; each combination gets its own request slots */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering data on device in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse; /* Push onto the in-use list; PetscSFLinkGetInUse finds it there */
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI.
     Roots recv in LEAF2ROOT and send in ROOT2LEAF; leaves do the opposite, so the two blocks below mirror each other.
     Only non-distinguished ranks (i >= nd*ranks) communicate via MPI; displacements are relative to their start. */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Only hand back what the caller asked for; each output pointer may be NULL */
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}


/* Find the in-use link whose (unit, rootdata, leafdata) match; with PETSC_OWN_POINTER the link is
   unchained from the in-use list (caller is expected to reclaim it later). Errors if no link matches. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
  PetscFunctionReturn(0); /* unreachable; SETERRQ returns above */
}

/* Return a link to the 'avail' cache for reuse; clears the data keys so it cannot match a lookup */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* matches the allocation in PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed.
*/
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
#if defined(PETSC_HAVE_CUDA)
      /* Device buffers (and the private stream) only exist in CUDA builds */
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
#endif
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlapping since this process does not participate in communication. Overlapping is harmless.
  */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) {
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}
#endif

/* Inspect the MPI datatype 'unit' and install the matching host pack/unpack kernels and
   type metadata (bs, unitbytes, basicunit, isbuiltin, unit) on the link */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Decompose 'unit' as n contiguous copies of each candidate basic type; the first nonzero count below wins */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops.
*/
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* For each basic type: pick the widest vector width (8/4/2/1) that equals or divides the count;
       the _1 suffix means bs==BS exactly (compile-time const), _0 means BS merely divides bs */
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* plain int only needs its own branch when PetscInt is 64-bit (otherwise it matched MPIU_INT above) */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* None of the known basic types matched: treat the unit as an opaque ("dumb") byte blob */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int); /* move data int-wise for better throughput */
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Non-builtin units are duplicated so the link owns its own handle (freed in PetscSFLinkDestroy) */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

/* Select the UnpackAndOp kernel for the given memtype/op/atomicity; *UnpackAndOp is NULL when no kernel supports the op */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM)
*UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    /* Atomic device kernels: needed when multiple leaves may update the same root concurrently */
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Select the ScatterAndOp kernel (sparse-to-sparse copy with reduction) for the given memtype/op/atomicity; NULL when unsupported */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Select the FetchAndOp kernel; only MPI_SUM/MPIU_SUM is supported */
PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

/* Select the local (on-process) FetchAndOp kernel; only MPI_SUM/MPIU_SUM is supported */
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              A set of helper routines for Pack/Unpack/Scatter on GPUs
 ============================================================================*/
#if defined(PETSC_HAVE_CUDA)
/* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
   make sure the data is ready for packing.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* When the user promises data is computed on the default stream, packing on the same stream needs no sync */
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
    cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Do nothing if we use stream aware mpi || has nothing for remote */
  if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
  /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
  if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* See comments above */
  if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
  if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
   does not know which stream the callers will use.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic *bas       = (PetscSF_Basic*)sf->data;
  PetscBool     host2host  = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
  if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);

  /* If rootmtype is HOST or DEVICE:
     If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
     If we have data from remote && no rootdirect (i.e., we called an unpack kernel), then we must also sync it (if rootdirect,
     i.e., no unpack kernel after MPI, MPI guarantees rootbuf is ready to use so that we do not need the sync).

     Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise
     CPU thread might use the yet-to-be-updated rootdata pending in the stream.
  */
  if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData */
  if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
  if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* rootmtype_mpi != rootmtype exactly when MPI cannot access the device buffer directly */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
#else

/* Non-CUDA builds: the helpers become no-ops that still yield a PetscErrorCode of 0 */
#define PetscSFLinkSyncDeviceBeforePackData(a,b)                 0
#define PetscSFLinkSyncStreamAfterPackRootData(a,b)              0
#define PetscSFLinkSyncStreamAfterPackLeafData(a,b)              0
#define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)            0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)            0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c)  0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c)  0

#endif

/* Log flops performed while unpacking into rootdata; only reductions on PetscScalar units are counted */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* Log flops performed while unpacking into leafdata; see PetscSFLinkLogFlopsAfterUnpackRootData */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink
link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
   Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

   Output Arguments:
  .data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination: one call reduces all count units at once */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/* Scatter fallback via MPI_Reduce_local: reduce src entries (gathered by srcIdx or contiguous from srcStart)
   into dst entries (addressed by dstIdx or contiguous from dstStart) */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      /* Source is contiguous: reuse the unpack fallback with src offset by srcStart */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
   Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

   Notes:
   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking
*/ 1366 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1367 if (UnpackAndOp) { 1368 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1369 ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1370 } else { 1371 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1372 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1373 } 1374 } 1375 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1376 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1377 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380 1381 /* Unpack leafbuf to leafdata */ 1382 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1383 { 1384 PetscErrorCode ierr; 1385 const PetscInt *leafindices = NULL; 1386 PetscInt count,start; 1387 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1388 PetscMemType leafmtype = link->leafmtype; 1389 PetscSFPackOpt opt = NULL; 1390 1391 PetscFunctionBegin; 1392 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1393 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1394 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1395 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1396 if (UnpackAndOp) { 1397 ierr = 
PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1398 ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1399 } else { 1400 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1401 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1402 } 1403 } 1404 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1405 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1406 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1407 PetscFunctionReturn(0); 1408 } 1409 1410 /* FetchAndOp rootdata with rootbuf */ 1411 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1412 { 1413 PetscErrorCode ierr; 1414 const PetscInt *rootindices = NULL; 1415 PetscInt count,start; 1416 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1417 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1418 PetscMemType rootmtype = link->rootmtype; 1419 PetscSFPackOpt opt = NULL; 1420 1421 PetscFunctionBegin; 1422 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1423 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1424 if (bas->rootbuflen[scope]) { 1425 /* Do FetchAndOp on rootdata with rootbuf */ 1426 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1427 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1428 ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1429 } 
1430 if (scope == PETSCSF_REMOTE) { 1431 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr); 1432 ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr); 1433 } 1434 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1435 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1436 PetscFunctionReturn(0); 1437 } 1438 1439 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */ 1440 PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op) 1441 { 1442 PetscErrorCode ierr; 1443 const PetscInt *rootindices = NULL,*leafindices = NULL; 1444 PetscInt count,rootstart,leafstart; 1445 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1446 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1447 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1448 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1449 1450 PetscFunctionBegin; 1451 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1452 if (rootmtype != leafmtype) { /* Uncommon case */ 1453 /* The local communication has to go through pack and unpack */ 1454 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1455 ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1456 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1457 } else { 1458 ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1459 if (ScatterAndOp) { 1460 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1461 
ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1462 ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1463 } else { 1464 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1465 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1466 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1467 } 1468 } 1469 PetscFunctionReturn(0); 1470 } 1471 1472 /* Reduce leafdata to rootdata locally */ 1473 PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op) 1474 { 1475 PetscErrorCode ierr; 1476 const PetscInt *rootindices = NULL,*leafindices = NULL; 1477 PetscInt count,rootstart,leafstart; 1478 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1479 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1480 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1481 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1482 1483 PetscFunctionBegin; 1484 if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1485 if (rootmtype != leafmtype) { 1486 /* The local communication has to go through pack and unpack */ 1487 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1488 ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1489 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1490 } 
else { 1491 ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1492 if (ScatterAndOp) { 1493 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1494 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1495 ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1496 } else { 1497 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1498 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1499 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1500 } 1501 } 1502 PetscFunctionReturn(0); 1503 } 1504 1505 /* Fetch rootdata to leafdata and leafupdate locally */ 1506 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1507 { 1508 PetscErrorCode ierr; 1509 const PetscInt *rootindices = NULL,*leafindices = NULL; 1510 PetscInt count,rootstart,leafstart; 1511 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1512 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1513 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1514 PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1515 1516 PetscFunctionBegin; 1517 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1518 if (rootmtype != leafmtype) { 1519 /* The local communication has to go through pack and unpack */ 1520 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1521 } else { 1522 ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1523 ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1524 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1525 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1526 } 1527 PetscFunctionReturn(0); 1528 } 1529 1530 /* 1531 Create per-rank pack/unpack optimizations based on indice patterns 1532 1533 Input Parameters: 1534 + n - Number of destination ranks 1535 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1536 - idx - [*] Array storing indices 1537 1538 Output Parameters: 1539 + opt - Pack optimizations. NULL if no optimizations. 1540 */ 1541 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1542 { 1543 PetscErrorCode ierr; 1544 PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1545 PetscBool optimizable = PETSC_TRUE; 1546 PetscSFPackOpt opt; 1547 1548 PetscFunctionBegin; 1549 ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1550 ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1551 opt->n = opt->array[0] = n; 1552 opt->offset = opt->array + 1; 1553 opt->start = opt->array + n + 2; 1554 opt->dx = opt->array + 2*n + 2; 1555 opt->dy = opt->array + 3*n + 2; 1556 opt->dz = opt->array + 4*n + 2; 1557 opt->X = opt->array + 5*n + 2; 1558 opt->Y = opt->array + 6*n + 2; 1559 1560 for (r=0; r<n; r++) { /* For each destination rank */ 1561 m = offset[r+1] - offset[r]; /* Total number of indices for this rank. 
We want to see if m can be factored into dx*dy*dz */ 1562 p = offset[r]; 1563 start = idx[p]; /* First index for this rank */ 1564 p++; 1565 1566 /* Search in X dimension */ 1567 for (dx=1; dx<m; dx++,p++) { 1568 if (start+dx != idx[p]) break; 1569 } 1570 1571 dydz = m/dx; 1572 X = dydz > 1 ? (idx[p]-start) : dx; 1573 /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1574 if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1575 for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1576 for (i=0; i<dx; i++,p++) { 1577 if (start+X*dy+i != idx[p]) { 1578 if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1579 else goto Z_dimension; 1580 } 1581 } 1582 } 1583 1584 Z_dimension: 1585 dz = m/(dx*dy); 1586 Y = dz > 1 ? (idx[p]-start)/X : dy; 1587 /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1588 if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1589 for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1590 for (j=0; j<dy; j++) { 1591 for (i=0; i<dx; i++,p++) { 1592 if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 1593 } 1594 } 1595 } 1596 opt->start[r] = start; 1597 opt->dx[r] = dx; 1598 opt->dy[r] = dy; 1599 opt->dz[r] = dz; 1600 opt->X[r] = X; 1601 opt->Y[r] = Y; 1602 } 1603 1604 finish: 1605 /* If not optimizable, free arrays to save memory */ 1606 if (!n || !optimizable) { 1607 ierr = PetscFree(opt->array);CHKERRQ(ierr); 1608 ierr = PetscFree(opt);CHKERRQ(ierr); 1609 *out = NULL; 1610 } else { 1611 opt->offset[0] = 0; 1612 for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1613 *out = opt; 1614 } 1615 PetscFunctionReturn(0); 1616 } 1617 1618 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out) 1619 { 1620 PetscErrorCode ierr; 1621 PetscSFPackOpt opt 
= *out; 1622 1623 PetscFunctionBegin; 1624 if (opt) { 1625 if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);} 1626 #if defined(PETSC_HAVE_CUDA) 1627 else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;} 1628 #endif 1629 ierr = PetscFree(opt);CHKERRQ(ierr); 1630 *out = NULL; 1631 } 1632 PetscFunctionReturn(0); 1633 } 1634 1635 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1636 { 1637 PetscErrorCode ierr; 1638 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1639 PetscInt i,j; 1640 1641 PetscFunctionBegin; 1642 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1643 for (i=0; i<2; i++) { /* Set defaults */ 1644 sf->leafstart[i] = 0; 1645 sf->leafcontig[i] = PETSC_TRUE; 1646 sf->leafdups[i] = PETSC_FALSE; 1647 bas->rootstart[i] = 0; 1648 bas->rootcontig[i] = PETSC_TRUE; 1649 bas->rootdups[i] = PETSC_FALSE; 1650 } 1651 1652 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1653 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1654 1655 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1656 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1657 1658 /* Are leaf indices for self and remote contiguous? 
If yes, it is best for pack/unpack */ 1659 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1660 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1661 } 1662 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1663 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1664 } 1665 1666 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1667 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1668 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1669 1670 /* Are root indices for self and remote contiguous? */ 1671 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1672 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1673 1674 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1675 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1676 1677 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1678 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1679 } 1680 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1681 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1682 } 1683 1684 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1685 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1686 1687 #if defined(PETSC_HAVE_CUDA) 1688 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1689 if 
(!sf->leafcontig[0]) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1690 if (!sf->leafcontig[1]) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1691 if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1692 if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1693 #endif 1694 1695 PetscFunctionReturn(0); 1696 } 1697 1698 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1699 { 1700 PetscErrorCode ierr; 1701 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1702 PetscInt i; 1703 1704 PetscFunctionBegin; 1705 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1706 ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1707 ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1708 ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1709 ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1710 } 1711 PetscFunctionReturn(0); 1712 } 1713