1 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> 7 #endif 8 /* 9 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 10 * therefore we pack data types manually. This file defines packing routines for the standard data types. 11 */ 12 13 #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 14 15 /* Operations working like s += t */ 16 #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 17 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 18 #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 19 #define OP_ASSIGN(op,s,t) do {(s) = (t);} while(0) 20 /* Ref MPI MAXLOC */ 21 #define OP_XLOC(op,s,t) \ 22 do { \ 23 if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 24 else if (!((s).u op (t).u)) s = t; \ 25 } while(0) 26 27 /* DEF_PackFunc - macro defining a Pack routine 28 29 Arguments of the macro: 30 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 31 .BS Block size for vectorization. It is a factor of bs. 32 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 33 34 Arguments of the Pack routine: 35 +count Number of indices in idx[]. 36 .start If indices are contiguous, it is the first index; otherwise, not used. 37 .idx Indices of entries to packed. NULL means contiguous indices, that is [start,start+count). 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .opt Per-remote pack optimization plan. NULL means no such plan. 40 .unpacked Address of the unpacked data. 
The entries will be packed are unpacked[idx[i]],for i in [0,count). 41 -packed Address of the packed data. 42 */ 43 #define DEF_PackFunc(Type,BS,EQ) \ 44 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45 { \ 46 PetscErrorCode ierr; \ 47 const Type *u = (const Type*)unpacked,*u2; \ 48 Type *p = (Type*)packed,*p2; \ 49 PetscInt i,j,k,l,r,step,bs=link->bs; \ 50 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 52 PetscFunctionBegin; \ 53 if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54 else if (!opt) { /* No optimizations available */ \ 55 for (i=0; i<count; i++) \ 56 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59 } else { \ 60 for (r=0; r<opt->n; r++) { \ 61 p2 = p + opt->offset[r]*MBS; \ 62 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63 idx2 = idx + opt->offset[r]; \ 64 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65 for (j=0; j<M; j++) \ 66 for (k=0; k<BS; k++) \ 67 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 69 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70 u2 = u + idx[opt->copy_start[i]]*MBS; \ 71 l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72 ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73 p2 += l; \ 74 } \ 75 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76 u2 = u + idx[opt->offset[r]]*MBS; \ 77 step = opt->stride_step[r]; \ 78 for (i=0; i<opt->stride_n[r]; i++) \ 79 for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 
80 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 81 } \ 82 } \ 83 PetscFunctionReturn(0); \ 84 } 85 86 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 87 and inserts into a sparse array. 88 89 Arguments: 90 .Type Type of the data 91 .BS Block size for vectorization 92 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 93 94 Notes: 95 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 96 */ 97 #define DEF_UnpackFunc(Type,BS,EQ) \ 98 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 99 { \ 100 PetscErrorCode ierr; \ 101 Type *u = (Type*)unpacked,*u2; \ 102 const Type *p = (const Type*)packed,*p2; \ 103 PetscInt i,j,k,l,r,step,bs=link->bs; \ 104 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 105 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. 
*/ \ 106 PetscFunctionBegin; \ 107 if (!idx) { \ 108 u2 = u + start*bs; \ 109 if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);} \ 110 } else if (!opt) { /* No optimizations available */ \ 111 for (i=0; i<count; i++) \ 112 for (j=0; j<M; j++) \ 113 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 114 } else { \ 115 for (r=0; r<opt->n; r++) { \ 116 p2 = p + opt->offset[r]*MBS; \ 117 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 118 idx2 = idx + opt->offset[r]; \ 119 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 120 for (j=0; j<M; j++) \ 121 for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 122 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 123 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 124 u2 = u + idx[opt->copy_start[i]]*MBS; \ 125 l = opt->copy_length[i]*MBS; \ 126 ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 127 p2 += l; \ 128 } \ 129 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 130 u2 = u + idx[opt->offset[r]]*MBS; \ 131 step = opt->stride_step[r]; \ 132 for (i=0; i<opt->stride_n[r]; i++) \ 133 for (j=0; j<M; j++) \ 134 for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 135 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 136 } \ 137 } \ 138 PetscFunctionReturn(0); \ 139 } 140 141 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 142 143 Arguments: 144 +Opname Name of the Op, such as Add, Mult, LAND, etc. 145 .Type Type of the data 146 .BS Block size for vectorization 147 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 148 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 149 .OpApply Macro defining application of the op. 
Could be OP_BINARY, OP_FUNCTION, OP_LXOR 150 */ 151 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 152 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 153 { \ 154 Type *u = (Type*)unpacked,*u2; \ 155 const Type *p = (const Type*)packed,*p2; \ 156 PetscInt i,j,k,l,r,step,bs=link->bs; \ 157 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 158 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 159 PetscFunctionBegin; \ 160 if (!idx) { \ 161 u += start*bs; \ 162 for (i=0; i<count; i++) \ 163 for (j=0; j<M; j++) \ 164 for (k=0; k<BS; k++) \ 165 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 166 } else if (!opt) { /* No optimizations available */ \ 167 for (i=0; i<count; i++) \ 168 for (j=0; j<M; j++) \ 169 for (k=0; k<BS; k++) \ 170 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 171 } else { \ 172 for (r=0; r<opt->n; r++) { \ 173 p2 = p + opt->offset[r]*MBS; \ 174 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 175 idx2 = idx + opt->offset[r]; \ 176 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 177 for (j=0; j<M; j++) \ 178 for (k=0; k<BS; k++) \ 179 OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 180 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 181 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 182 u2 = u + idx[opt->copy_start[i]]*MBS; \ 183 l = opt->copy_length[i]*MBS; \ 184 for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]); \ 185 p2 += l; \ 186 } \ 187 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 188 u2 = u + idx[opt->offset[r]]*MBS; \ 189 step = opt->stride_step[r]; \ 190 for (i=0; i<opt->stride_n[r]; i++) \ 191 for (j=0; j<M; j++) \ 192 for (k=0; k<BS; k++) \ 193 OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 194 } else 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 195 } \ 196 } \ 197 PetscFunctionReturn(0); \ 198 } 199 200 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 201 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \ 202 { \ 203 Type *u = (Type*)unpacked,*p = (Type*)packed,t; \ 204 PetscInt i,j,k,bs=link->bs; \ 205 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 206 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 207 PetscFunctionBegin; \ 208 if (!idx) { \ 209 u += start*bs; \ 210 for (i=0; i<count; i++) \ 211 for (j=0; j<M; j++) \ 212 for (k=0; k<BS; k++) { \ 213 t = u[i*MBS+j*BS+k]; \ 214 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 215 p[i*MBS+j*BS+k] = t; \ 216 } \ 217 } else { \ 218 for (i=0; i<count; i++) \ 219 for (j=0; j<M; j++) \ 220 for (k=0; k<BS; k++) { \ 221 t = u[idx[i]*MBS+j*BS+k]; \ 222 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 223 p[i*MBS+j*BS+k] = t; \ 224 } \ 225 } \ 226 PetscFunctionReturn(0); \ 227 } 228 229 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 230 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \ 231 { \ 232 const Type *x = (const Type*)xdata; \ 233 Type *y = (Type*)ydata; \ 234 PetscInt i,j,k,bs = link->bs; \ 235 const PetscInt M = (EQ) ? 
1 : bs/BS; \ 236 const PetscInt MBS = M*BS; \ 237 PetscFunctionBegin; \ 238 if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\ 239 if (!idy) { \ 240 y += starty*bs; \ 241 for (i=0; i<count; i++) \ 242 for (j=0; j<M; j++) \ 243 for (k=0; k<BS; k++) \ 244 OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \ 245 } else { \ 246 for (i=0; i<count; i++) \ 247 for (j=0; j<M; j++) \ 248 for (k=0; k<BS; k++) \ 249 OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \ 250 } \ 251 PetscFunctionReturn(0); \ 252 } 253 254 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 255 static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \ 256 { \ 257 Type *x = (Type*)rootdata,*y2 = (Type*)leafupdate; \ 258 const Type *y = (const Type*)leafdata; \ 259 PetscInt i,j,k,bs = link->bs; \ 260 const PetscInt M = (EQ) ? 
1 : bs/BS; \ 261 const PetscInt MBS = M*BS; \ 262 PetscFunctionBegin; \ 263 if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\ 264 if (!leafindices) { \ 265 y += leafstart*bs; y2 += leafstart*bs; \ 266 for (i=0; i<count; i++) \ 267 for (j=0; j<M; j++) \ 268 for (k=0; k<BS; k++) { \ 269 y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \ 270 OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]); \ 271 } \ 272 } else { \ 273 for (i=0; i<count; i++) \ 274 for (j=0; j<M; j++) \ 275 for (k=0; k<BS; k++) { \ 276 y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \ 277 OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]); \ 278 } \ 279 } \ 280 PetscFunctionReturn(0); \ 281 } 282 283 284 /* Pack, Unpack/Fetch ops */ 285 #define DEF_Pack(Type,BS,EQ) \ 286 DEF_PackFunc(Type,BS,EQ) \ 287 DEF_UnpackFunc(Type,BS,EQ) \ 288 DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 289 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 290 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 291 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 292 link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 293 } 294 295 /* Add, Mult ops */ 296 #define DEF_Add(Type,BS,EQ) \ 297 DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 298 DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 299 DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 300 DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 301 DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 302 DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 303 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 304 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 305 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 306 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 307 link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 308 link->h_ScatterAndMult = 
CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 309 link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 310 } 311 312 /* Max, Min ops */ 313 #define DEF_Cmp(Type,BS,EQ) \ 314 DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 315 DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 316 DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 317 DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 318 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 319 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 320 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 321 link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 322 link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 323 } 324 325 /* Logical ops. 326 The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 327 the compilation warning "empty macro arguments are undefined in ISO C90" 328 */ 329 #define DEF_Log(Type,BS,EQ) \ 330 DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 331 DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 332 DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 333 DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 334 DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 335 DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 336 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 337 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 338 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 339 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 340 link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 341 link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 342 link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 343 } 344 345 /* Bitwise ops */ 346 #define DEF_Bit(Type,BS,EQ) \ 347 DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 348 DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 349 DEF_UnpackAndOp 
(Type,BS,EQ,BXOR,^,OP_BINARY) \ 350 DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 351 DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 352 DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 353 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 354 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 355 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 356 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 357 link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 358 link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 359 link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 360 } 361 362 /* Maxloc, Minloc ops */ 363 #define DEF_Xloc(Type,BS,EQ) \ 364 DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 365 DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 366 DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 367 DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 368 static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 369 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 370 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 371 link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 372 link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 373 } 374 375 #define DEF_IntegerType(Type,BS,EQ) \ 376 DEF_Pack(Type,BS,EQ) \ 377 DEF_Add(Type,BS,EQ) \ 378 DEF_Cmp(Type,BS,EQ) \ 379 DEF_Log(Type,BS,EQ) \ 380 DEF_Bit(Type,BS,EQ) \ 381 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 382 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 383 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 384 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 385 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 386 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 387 } 388 389 #define DEF_RealType(Type,BS,EQ) \ 390 DEF_Pack(Type,BS,EQ) \ 391 DEF_Add(Type,BS,EQ) \ 392 DEF_Cmp(Type,BS,EQ) \ 393 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 
394 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 395 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 396 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 397 } 398 399 #if defined(PETSC_HAVE_COMPLEX) 400 #define DEF_ComplexType(Type,BS,EQ) \ 401 DEF_Pack(Type,BS,EQ) \ 402 DEF_Add(Type,BS,EQ) \ 403 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 404 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 405 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 406 } 407 #endif 408 409 #define DEF_DumbType(Type,BS,EQ) \ 410 DEF_Pack(Type,BS,EQ) \ 411 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 412 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 413 } 414 415 /* Maxloc, Minloc */ 416 #define DEF_PairType(Type,BS,EQ) \ 417 DEF_Pack(Type,BS,EQ) \ 418 DEF_Xloc(Type,BS,EQ) \ 419 static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 420 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 421 CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 422 } 423 424 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 425 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 426 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 427 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 428 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 429 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 430 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 431 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. 
*/ 432 433 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 434 DEF_IntegerType(int,1,1) 435 DEF_IntegerType(int,2,1) 436 DEF_IntegerType(int,4,1) 437 DEF_IntegerType(int,8,1) 438 DEF_IntegerType(int,1,0) 439 DEF_IntegerType(int,2,0) 440 DEF_IntegerType(int,4,0) 441 DEF_IntegerType(int,8,0) 442 #endif 443 444 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 445 typedef signed char SignedChar; 446 DEF_IntegerType(SignedChar,1,1) 447 DEF_IntegerType(SignedChar,2,1) 448 DEF_IntegerType(SignedChar,4,1) 449 DEF_IntegerType(SignedChar,8,1) 450 DEF_IntegerType(SignedChar,1,0) 451 DEF_IntegerType(SignedChar,2,0) 452 DEF_IntegerType(SignedChar,4,0) 453 DEF_IntegerType(SignedChar,8,0) 454 455 typedef unsigned char UnsignedChar; 456 DEF_IntegerType(UnsignedChar,1,1) 457 DEF_IntegerType(UnsignedChar,2,1) 458 DEF_IntegerType(UnsignedChar,4,1) 459 DEF_IntegerType(UnsignedChar,8,1) 460 DEF_IntegerType(UnsignedChar,1,0) 461 DEF_IntegerType(UnsignedChar,2,0) 462 DEF_IntegerType(UnsignedChar,4,0) 463 DEF_IntegerType(UnsignedChar,8,0) 464 465 DEF_RealType(PetscReal,1,1) 466 DEF_RealType(PetscReal,2,1) 467 DEF_RealType(PetscReal,4,1) 468 DEF_RealType(PetscReal,8,1) 469 DEF_RealType(PetscReal,1,0) 470 DEF_RealType(PetscReal,2,0) 471 DEF_RealType(PetscReal,4,0) 472 DEF_RealType(PetscReal,8,0) 473 474 #if defined(PETSC_HAVE_COMPLEX) 475 DEF_ComplexType(PetscComplex,1,1) 476 DEF_ComplexType(PetscComplex,2,1) 477 DEF_ComplexType(PetscComplex,4,1) 478 DEF_ComplexType(PetscComplex,8,1) 479 DEF_ComplexType(PetscComplex,1,0) 480 DEF_ComplexType(PetscComplex,2,0) 481 DEF_ComplexType(PetscComplex,4,0) 482 DEF_ComplexType(PetscComplex,8,0) 483 #endif 484 485 #define PairType(Type1,Type2) Type1##_##Type2 486 typedef struct {int u; int i;} PairType(int,int); 487 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 488 DEF_PairType(PairType(int,int),1,1) 489 
DEF_PairType(PairType(PetscInt,PetscInt),1,1) 490 491 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 492 DEF_DumbType(char,1,1) 493 DEF_DumbType(char,2,1) 494 DEF_DumbType(char,4,1) 495 DEF_DumbType(char,1,0) 496 DEF_DumbType(char,2,0) 497 DEF_DumbType(char,4,0) 498 499 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 500 DEF_DumbType(DumbInt,1,1) 501 DEF_DumbType(DumbInt,2,1) 502 DEF_DumbType(DumbInt,4,1) 503 DEF_DumbType(DumbInt,8,1) 504 DEF_DumbType(DumbInt,1,0) 505 DEF_DumbType(DumbInt,2,0) 506 DEF_DumbType(DumbInt,4,0) 507 DEF_DumbType(DumbInt,8,0) 508 509 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 510 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 511 { 512 int ierr; 513 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 514 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 515 return MPI_SUCCESS; 516 } 517 #endif 518 519 /* 520 The routine Creates a communication link for the given operation. It first looks up its link cache. If 521 there is a free & suitable one, it uses it. Otherwise it creates a new one. 522 523 A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 524 root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 525 can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 526 those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 527 528 The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 529 530 In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 531 532 The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 533 need pack/unpack data. 
 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype_mpi,leafmtype_mpi;  /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata as buffers with the given sf, sfop and op?
     Only possible when the indices are contiguous, and for the unpack side only when op is INSERT. */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    /* Without gpu-aware MPI, MPI only ever sees host memory */
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* Cache miss: create a new link */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 (data direct to MPI or not) */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) {     /* Two communication directions */
    for (j=0; j<2; j++) {   /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) { /* Likewise, disguise leafdata as leafbuf */
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  /* Chain the link onto the in-use list; PetscSFLinkGetInUse() finds it there later */
  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI.
     Requests are created lazily, once per (direction, memtype, direct-flag) combination,
     and the *reqsinited flags record which combinations are already set up. */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Roots receive in leaf-to-root direction; only non-distinguished (remote) ranks [ndrootranks,nrootranks) participate */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        /* Leaves send in leaf-to-root direction (mirror of the root side above) */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand back whichever of the four outputs the caller asked for */
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}

/* Look up a link currently in use that matches (unit, rootdata, leafdata).
   With cmode = PETSC_OWN_POINTER the link is removed from the in-use list; with
   PETSC_USE_POINTER it stays on the list. Errors if no matching link is found. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache; p walks the 'next' pointers so a match can be unlinked in place */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
  PetscFunctionReturn(0);
}

/* Return a link to the 'avail' pool so it can be reused by a later communication.
   Clears the data keys so stale pointers cannot match in future lookups. */
PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* 8 = #(direction x memtype x direct) request slots per rank set -- TODO confirm against link->reqs layout */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} /* only duplicated types are freed */
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
#if defined(PETSC_HAVE_CUDA)
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
#endif
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlapping since this process does not participate in communication. Overlapping is harmless.
  */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) {
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type.
 Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}
#endif

/* Inspect the MPI datatype 'unit' and install the matching host pack/unpack kernels on the link.
   Also sets link->bs (basic units per entry), link->unitbytes, link->basicunit and link->unit.
   The dispatch prefers typed kernels (real/int/char/complex/pair types) and falls back to
   byte-wise "dumb" kernels for any other fixed-extent type. */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  /* In the ladders below, the (n == K) cases select kernels compiled with EQ=1 (compile-time
     constant block size) and the (n%K == 0) cases select the EQ=0 variants; order matters. */
  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only needed when PetscInt != int; otherwise nPetscInt already covered it */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unrecognized type: fall back to byte/int "dumb" kernels that support copy but not reduction */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

/* Map (memtype, op, atomic) to the link's UnpackAndOp kernel; *UnpackAndOp is NULL when no kernel matches,
   in which case callers fall back to MPI_Reduce_local (see PetscSFLinkUnpackDataWithMPIReduceLocal). */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM)
 *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    /* da_* variants use atomics; selected when destination entries may be updated concurrently */
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Map (memtype, op, atomic) to the link's ScatterAndOp kernel, mirroring PetscSFLinkGetUnpackAndOp;
   *ScatterAndOp is NULL when no kernel matches. */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

/* Return the link's FetchAndOp kernel for (memtype, atomic); only MPI_SUM/MPIU_SUM is supported. */
PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)
 *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

/* Return the link's local (root-to-leaf on the same process) FetchAndOp kernel; only MPI_SUM/MPIU_SUM is supported. */
PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              A set of helper routines for Pack/Unpack/Scatter on GPUs
 ============================================================================*/
#if defined(PETSC_HAVE_CUDA)
/* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
   make sure the data is ready for packing.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  if (sf->use_default_stream) PetscFunctionReturn(0); /* caller promised data is on the default stream; no sync needed */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
    cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* rootdata is on device && send non-empty to remote && (we called a packing kernel || we async-copied rootdata from device to host) */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && bas->rootbuflen[PETSCSF_REMOTE] && (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
/* Same as above, for the leaf side */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->leafbuflen[PETSCSF_REMOTE] && (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
   does not know which stream the callers will use.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (bas->rootbuflen[PETSCSF_LOCAL] || bas->rootbuflen[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
/* Same as above, for the leaf side */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (sf->leafbuflen[PETSCSF_LOCAL] || sf->leafbuflen[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Only stage when roots live on device but MPI sees host buffers (rootmtype_mpi != rootmtype) */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Same as above, for the leaf side */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
    void   *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void   *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count  = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
#else
/* Without CUDA the GPU helpers degenerate to no-ops that "return" error code 0 */
#define PetscSFLinkSyncDeviceBeforePackData(a,b)                 0
#define PetscSFLinkSyncStreamAfterPackRootData(a,b)              0
#define PetscSFLinkSyncStreamAfterPackLeafData(a,b)              0
#define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)            0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)            0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c)  0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c)  0

#endif

/* Log flops for a reduction into rootdata; only counted when the basic unit is PetscScalar */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* Log flops for a reduction into leafdata; only counted when the basic unit is PetscScalar */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.

  Input Arguments:
  +sf      - The StarForest
  .link    - The link
  .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
  .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
  .buf     - A contiguous buffer to unpack from
  -op      - Operation after unpack

  Output Arguments:
  .data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination: reduce the whole buffer in one call */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/* Scatter xdata[idx[i]] into ydata (at idy[i], or contiguously from starty) with op, via MPI_Reduce_local */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    for (i=0; i<count; i++) {
      disp = idy? idy[i] : starty + i;
      ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

  Notes:
  When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI can access it (use_gpu_aware_mpi or not)
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
    ierr =
PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1338 ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1339 } 1340 if (scope == PETSCSF_REMOTE) { 1341 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1342 ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1343 } 1344 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 /* Pack leafdata to leafbuf */ 1349 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1350 { 1351 PetscErrorCode ierr; 1352 const PetscInt *leafindices = NULL; 1353 PetscInt count,start; 1354 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1355 PetscMemType leafmtype = link->leafmtype; 1356 1357 PetscFunctionBegin; 1358 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1359 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1360 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1361 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1362 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1363 ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1364 } 1365 if (scope == PETSCSF_REMOTE) { 1366 ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1367 ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1368 } 1369 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1370 PetscFunctionReturn(0); 1371 } 1372 1373 /* Unpack rootbuf to rootdata */ 1374 PetscErrorCode 
PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1375 { 1376 PetscErrorCode ierr; 1377 const PetscInt *rootindices = NULL; 1378 PetscInt count,start; 1379 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1380 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1381 PetscMemType rootmtype = link->rootmtype; 1382 1383 PetscFunctionBegin; 1384 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1385 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1386 if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1387 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1388 if (UnpackAndOp) { 1389 ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1390 ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1391 } else { 1392 ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1393 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1394 } 1395 } 1396 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1397 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1398 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1399 PetscFunctionReturn(0); 1400 } 1401 1402 /* Unpack leafbuf to leafdata */ 1403 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1404 { 1405 PetscErrorCode ierr; 1406 const PetscInt *leafindices = NULL; 1407 PetscInt count,start; 1408 
PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1409 PetscMemType leafmtype = link->leafmtype; 1410 1411 PetscFunctionBegin; 1412 ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1413 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1414 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1415 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1416 if (UnpackAndOp) { 1417 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1418 ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1419 } else { 1420 ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1421 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1422 } 1423 } 1424 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1425 ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1426 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1427 PetscFunctionReturn(0); 1428 } 1429 1430 /* FetchAndOp rootdata with rootbuf */ 1431 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1432 { 1433 PetscErrorCode ierr; 1434 const PetscInt *rootindices = NULL; 1435 PetscInt count,start; 1436 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1437 PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL; 1438 PetscMemType rootmtype = link->rootmtype; 1439 1440 PetscFunctionBegin; 1441 ierr = 
PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1442 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1443 if (bas->rootbuflen[scope]) { 1444 /* Do FetchAndOp on rootdata with rootbuf */ 1445 ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1446 ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1447 ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1448 } 1449 if (scope == PETSCSF_REMOTE) { 1450 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr); 1451 ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr); 1452 } 1453 ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1454 ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1455 PetscFunctionReturn(0); 1456 } 1457 1458 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */ 1459 PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op) 1460 { 1461 PetscErrorCode ierr; 1462 const PetscInt *rootindices = NULL,*leafindices = NULL; 1463 PetscInt count,rootstart,leafstart; 1464 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1465 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1466 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL; 1467 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1468 1469 PetscFunctionBegin; 1470 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1471 if (rootmtype != leafmtype) { /* Uncommon case */ 1472 /* The local communication has to go through pack and unpack */ 
1473 ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1474 ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1475 ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1476 } else { 1477 if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */ 1478 ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr); 1479 rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */ 1480 if (UnpackAndOp) { 1481 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1482 ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr); 1483 } else { 1484 ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1485 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr); 1486 } 1487 } else { /* ScatterAndOp */ 1488 ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1489 if (ScatterAndOp) { 1490 ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1491 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1492 ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr); 1493 } else { 1494 ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr); 1495 ierr = 
PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1496 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1497 } 1498 } 1499 } 1500 PetscFunctionReturn(0); 1501 } 1502 1503 /* Reduce leafdata to rootdata locally */ 1504 PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op) 1505 { 1506 PetscErrorCode ierr; 1507 const PetscInt *rootindices = NULL,*leafindices = NULL; 1508 PetscInt count,rootstart,leafstart; 1509 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1510 PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1511 PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL; 1512 const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1513 1514 PetscFunctionBegin; 1515 if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1516 if (rootmtype != leafmtype) { 1517 /* The local communication has to go through pack and unpack */ 1518 ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1519 ierr = PetscMemcpyWithMemType(rootmtype,leafmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1520 ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1521 } else { 1522 if (sf->leafcontig[PETSCSF_LOCAL]) { 1523 /* If leaf indices are contiguous, Scatter becomes Unpack */ 1524 ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr); 1525 leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */ 1526 if (UnpackAndOp) { 1527 ierr = 
PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1528 ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr); 1529 } else { 1530 ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1531 ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr); 1532 } 1533 } else { 1534 ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1535 if (ScatterAndOp) { 1536 ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1537 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1538 ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr); 1539 } else { 1540 ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1541 ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr); 1542 ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1543 } 1544 } 1545 } 1546 PetscFunctionReturn(0); 1547 } 1548 1549 /* Fetch rootdata to leafdata and leafupdate locally */ 1550 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1551 { 1552 PetscErrorCode ierr; 1553 const PetscInt *rootindices = NULL,*leafindices = NULL; 1554 PetscInt count,rootstart,leafstart; 1555 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1556 PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL; 1557 
const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1558 1559 PetscFunctionBegin; 1560 if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1561 if (rootmtype != leafmtype) { 1562 /* The local communication has to go through pack and unpack */ 1563 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1564 } else { 1565 ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1566 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1567 ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1568 ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1569 } 1570 PetscFunctionReturn(0); 1571 } 1572 1573 /* 1574 Create per-rank pack/unpack optimizations based on indice patterns 1575 1576 Input Parameters: 1577 + n - Number of target ranks 1578 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1579 - idx - [*] Array storing indices 1580 1581 Output Parameters: 1582 + opt - Pack optimizations. NULL if no optimizations. 1583 */ 1584 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 1585 { 1586 PetscErrorCode ierr; 1587 PetscInt i,j,k,n_copies,tot_copies = 0,step; 1588 PetscBool strided,optimized = PETSC_FALSE; 1589 PetscSFPackOpt opt; 1590 1591 PetscFunctionBegin; 1592 if (!n) { 1593 *out = NULL; 1594 PetscFunctionReturn(0); 1595 } 1596 1597 ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 1598 ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 1599 ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 1600 /* Make opt->offset[] zero-based. 
If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */ 1601 if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} 1602 1603 opt->n = n; 1604 1605 /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */ 1606 for (i=0; i<n; i++) { /* for each target processor */ 1607 /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 1608 n_copies = 1; 1609 for (j=offset[i]; j<offset[i+1]-1; j++) { 1610 if (idx[j]+1 != idx[j+1]) n_copies++; 1611 } 1612 /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 1613 then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 1614 */ 1615 if ((offset[i+1]-offset[i])/n_copies >= 32) { 1616 opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 1617 optimized = PETSC_TRUE; 1618 tot_copies += n_copies; 1619 } 1620 } 1621 1622 /* Setup memcpy plan for each contiguous piece */ 1623 k = 0; /* k-th copy */ 1624 ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 1625 for (i=0; i<n; i++) { /* for each target processor */ 1626 if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 1627 n_copies = 1; 1628 opt->copy_start[k] = offset[i] - offset[0]; 1629 for (j=offset[i]; j<offset[i+1]-1; j++) { 1630 if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 1631 n_copies++; 1632 opt->copy_start[k+1] = j-offset[0]+1; 1633 opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 1634 k++; 1635 } 1636 } 1637 /* Set copy length of the last copy for this target */ 1638 opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 1639 k++; 1640 } 1641 /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 1642 opt->copy_offset[i+1] = k; 1643 } 1644 1645 /* Last chance! 
If the indices do not have long contiguous pieces, are they strided? */ 1646 for (i=0; i<n; i++) { /* for each remote */ 1647 if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 1648 strided = PETSC_TRUE; 1649 step = idx[offset[i]+1] - idx[offset[i]]; 1650 for (j=offset[i]; j<offset[i+1]-1; j++) { 1651 if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 1652 } 1653 if (strided) { 1654 opt->type[i] = PETSCSF_PACKOPT_STRIDE; 1655 opt->stride_step[i] = step; 1656 opt->stride_n[i] = offset[i+1] - offset[i]; 1657 optimized = PETSC_TRUE; 1658 } 1659 } 1660 } 1661 /* If no rank gets optimized, free arrays to save memory */ 1662 if (!optimized) { 1663 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 1664 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 1665 ierr = PetscFree(opt);CHKERRQ(ierr); 1666 *out = NULL; 1667 } else *out = opt; 1668 PetscFunctionReturn(0); 1669 } 1670 1671 PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out) 1672 { 1673 PetscErrorCode ierr; 1674 PetscSFPackOpt opt = *out; 1675 1676 PetscFunctionBegin; 1677 if (opt) { 1678 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 1679 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 1680 ierr = PetscFree(opt);CHKERRQ(ierr); 1681 *out = NULL; 1682 } 1683 PetscFunctionReturn(0); 1684 } 1685 1686 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1687 { 1688 PetscErrorCode ierr; 1689 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1690 PetscInt i,j; 1691 1692 PetscFunctionBegin; 1693 /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1694 for (i=0; i<2; i++) { /* Set defaults */ 1695 sf->leafstart[i] = 0; 1696 sf->leafcontig[i] = PETSC_TRUE; 1697 sf->leafdups[i] = PETSC_FALSE; 1698 bas->rootstart[i] = 0; 1699 bas->rootcontig[i] = PETSC_TRUE; 1700 bas->rootdups[i] 
= PETSC_FALSE; 1701 } 1702 1703 sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1704 sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1705 1706 if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1707 if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1708 1709 /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1710 for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1711 if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1712 } 1713 for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1714 if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1715 } 1716 1717 /* If not, see if we can have per-rank optimizations by doing index analysis */ 1718 if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1719 if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1720 1721 /* Are root indices for self and remote contiguous? 
*/ 1722 bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1723 bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1724 1725 if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1726 if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1727 1728 for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1729 if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1730 } 1731 for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1732 if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1733 } 1734 1735 if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1736 if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1737 1738 #if defined(PETSC_HAVE_CUDA) 1739 /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1740 if (!sf->leafcontig[0]) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1741 if (!sf->leafcontig[1]) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1742 if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1743 if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1744 #endif 1745 1746 PetscFunctionReturn(0); 1747 } 1748 1749 PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1750 { 1751 PetscErrorCode ierr; 1752 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1753 PetscInt i; 1754 1755 PetscFunctionBegin; 1756 for (i=PETSCSF_LOCAL; 
i<=PETSCSF_REMOTE; i++) { 1757 ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr); 1758 ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr); 1759 } 1760 PetscFunctionReturn(0); 1761 } 1762