
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

/* Paste four tokens together with '_' separators, e.g., CPPJoin4(Pack,int,1,1) -> Pack_int_1_1 */
#define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)    /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)    /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)             /* plain assignment; op is unused */
/* Ref MPI MAXLOC */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u)         (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while(0)

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     If indices are contiguous, it is the first index; otherwise, not used.
   .idx       Indices of entries to be packed. NULL means contiguous indices, that is [start,start+count).
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .opt       Per-remote pack optimization plan. NULL means no such plan.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
*/
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)unpacked,*u2; \
    Type           *p = (Type*)packed,*p2; \
    PetscInt       i,j,k,l,r,step,bs=link->bs; \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \
    else if (!opt) { /* No optimizations available */ \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++)  /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
    } else { \
      for (r=0; r<opt->n; r++) { /* Loop over the per-remote optimization plans */ \
        p2 = p + opt->offset[r]*MBS; \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \
          idx2 = idx + opt->offset[r]; \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \
            for (j=0; j<M; j++) \
              for (k=0; k<BS; k++) \
                p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th contiguous piece */ \
            u2   = u + idx[opt->copy_start[i]]*MBS; \
            l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \
            ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \
            p2  += l; \
          } \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \
          u2   = u + idx[opt->offset[r]]*MBS; \
          step = opt->stride_step[r]; \
          for (i=0; i<opt->stride_n[r]; i++) \
            for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \
80 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 81 } \ 82 } \ 83 PetscFunctionReturn(0); \ 84 } 85 86 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 87 and inserts into a sparse array. 88 89 Arguments: 90 .Type Type of the data 91 .BS Block size for vectorization 92 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 93 94 Notes: 95 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 96 */ 97 #define DEF_UnpackFunc(Type,BS,EQ) \ 98 static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 99 { \ 100 PetscErrorCode ierr; \ 101 Type *u = (Type*)unpacked,*u2; \ 102 const Type *p = (const Type*)packed,*p2; \ 103 PetscInt i,j,k,l,r,step,bs=link->bs; \ 104 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 105 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. 
*/ \ 106 PetscFunctionBegin; \ 107 if (!idx) { \ 108 u2 = u + start*bs; \ 109 if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);} \ 110 } else if (!opt) { /* No optimizations available */ \ 111 for (i=0; i<count; i++) \ 112 for (j=0; j<M; j++) \ 113 for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 114 } else { \ 115 for (r=0; r<opt->n; r++) { \ 116 p2 = p + opt->offset[r]*MBS; \ 117 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 118 idx2 = idx + opt->offset[r]; \ 119 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 120 for (j=0; j<M; j++) \ 121 for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 122 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 123 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 124 u2 = u + idx[opt->copy_start[i]]*MBS; \ 125 l = opt->copy_length[i]*MBS; \ 126 ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 127 p2 += l; \ 128 } \ 129 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 130 u2 = u + idx[opt->offset[r]]*MBS; \ 131 step = opt->stride_step[r]; \ 132 for (i=0; i<opt->stride_n[r]; i++) \ 133 for (j=0; j<M; j++) \ 134 for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 135 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 136 } \ 137 } \ 138 PetscFunctionReturn(0); \ 139 } 140 141 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 142 143 Arguments: 144 +Opname Name of the Op, such as Add, Mult, LAND, etc. 145 .Type Type of the data 146 .BS Block size for vectorization 147 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 148 .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 149 .OpApply Macro defining application of the op. 
Could be OP_BINARY, OP_FUNCTION, OP_LXOR 150 */ 151 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 152 static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 153 { \ 154 Type *u = (Type*)unpacked,*u2; \ 155 const Type *p = (const Type*)packed,*p2; \ 156 PetscInt i,j,k,l,r,step,bs=link->bs; \ 157 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 158 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 159 PetscFunctionBegin; \ 160 if (!idx) { \ 161 u += start*bs; \ 162 for (i=0; i<count; i++) \ 163 for (j=0; j<M; j++) \ 164 for (k=0; k<BS; k++) \ 165 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 166 } else if (!opt) { /* No optimizations available */ \ 167 for (i=0; i<count; i++) \ 168 for (j=0; j<M; j++) \ 169 for (k=0; k<BS; k++) \ 170 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 171 } else { \ 172 for (r=0; r<opt->n; r++) { \ 173 p2 = p + opt->offset[r]*MBS; \ 174 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 175 idx2 = idx + opt->offset[r]; \ 176 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 177 for (j=0; j<M; j++) \ 178 for (k=0; k<BS; k++) \ 179 OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 180 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 181 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 182 u2 = u + idx[opt->copy_start[i]]*MBS; \ 183 l = opt->copy_length[i]*MBS; \ 184 for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]); \ 185 p2 += l; \ 186 } \ 187 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 188 u2 = u + idx[opt->offset[r]]*MBS; \ 189 step = opt->stride_step[r]; \ 190 for (i=0; i<opt->stride_n[r]; i++) \ 191 for (j=0; j<M; j++) \ 192 for (k=0; k<BS; k++) \ 193 OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 194 } else 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 195 } \ 196 } \ 197 PetscFunctionReturn(0); \ 198 } 199 200 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 201 static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \ 202 { \ 203 Type *u = (Type*)unpacked,*p = (Type*)packed,t; \ 204 PetscInt i,j,k,bs=link->bs; \ 205 const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 206 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 207 PetscFunctionBegin; \ 208 if (!idx) { \ 209 u += start*bs; \ 210 for (i=0; i<count; i++) \ 211 for (j=0; j<M; j++) \ 212 for (k=0; k<BS; k++) { \ 213 t = u[i*MBS+j*BS+k]; \ 214 OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 215 p[i*MBS+j*BS+k] = t; \ 216 } \ 217 } else { \ 218 for (i=0; i<count; i++) \ 219 for (j=0; j<M; j++) \ 220 for (k=0; k<BS; k++) { \ 221 t = u[idx[i]*MBS+j*BS+k]; \ 222 OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 223 p[i*MBS+j*BS+k] = t; \ 224 } \ 225 } \ 226 PetscFunctionReturn(0); \ 227 } 228 229 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 230 static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \ 231 { \ 232 const Type *x = (const Type*)xdata; \ 233 Type *y = (Type*)ydata; \ 234 PetscInt i,j,k,bs = link->bs; \ 235 const PetscInt M = (EQ) ? 
                         1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\
    if (!idy) { /* Contiguous destination starting at starty */ \
      y += starty*bs; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) \
            OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - fetch old root values into leafupdate, then apply op to roots with leafdata, all locally (rootindices must be non-NULL) */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \
  { \
    Type           *x = (Type*)rootdata,*y2 = (Type*)leafupdate; \
    const Type     *y = (const Type*)leafdata; \
    PetscInt       i,j,k,bs = link->bs; \
    const PetscInt M = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\
    if (!leafindices) { /* Contiguous leaves starting at leafstart */ \
      y += leafstart*bs; y2 += leafstart*bs; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) { \
            y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \
            OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]); \
          } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) { \
            y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \
            OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]); \
          } \
    } \
    PetscFunctionReturn(0); \
  }


/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,            Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = \
                               CPPJoin4(ScatterAndMult, Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Logical ops.
   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90"
*/
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

/* Integer types support all of pack, add/mult, compare, logical and bitwise ops */
#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

/* Real types support pack, add/mult and compare ops, but no logical/bitwise ops */
#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
/* Complex types support pack and add/mult ops, but no ordering-based (compare) ops */
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

/* Opaque (dumb) types only support pack/unpack-insert, i.e., plain copies */
#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

#define PairType(Type1,Type2) Type1##_##Type2
/* Pair structs for MAXLOC/MINLOC: u is the value, i is the location (cf. MPI_2INT layout) */
typedef struct {int u;      int i;     } PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback for MPI implementations lacking MPI_Type_dup: a committed contiguous(1) copy of the type behaves the same */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
   need pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdirect with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* No cached link matched: build a fresh one */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 (data direct to MPI or not) */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) {     /* Two communication directions */
    for (j=0; j<2; j++) {   /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) { /* Likewise, leafdata doubles as leafbuf */
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering data on device in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) { /* Roots receive during a reduce */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { /* Skip the first ndrootranks distinguished (self) ranks */
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) { /* Leaves send during a reduce */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}


/* Find the in-use link matching unit/rootdata/leafdata; with PETSC_OWN_POINTER also remove it from the in-use list. Errors if no link matches. */
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
PetscErrorCode ierr; 752 PetscSFLink link,*p; 753 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 754 755 PetscFunctionBegin; 756 /* Look for types in cache */ 757 for (p=&bas->inuse; (link=*p); p=&link->next) { 758 PetscBool match; 759 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 760 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 761 switch (cmode) { 762 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 763 case PETSC_USE_POINTER: break; 764 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 765 } 766 *mylink = link; 767 PetscFunctionReturn(0); 768 } 769 } 770 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 771 PetscFunctionReturn(0); 772 } 773 774 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link) 775 { 776 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 777 778 PetscFunctionBegin; 779 (*link)->rootdata = NULL; 780 (*link)->leafdata = NULL; 781 (*link)->next = bas->avail; 782 bas->avail = *link; 783 *link = NULL; 784 PetscFunctionReturn(0); 785 } 786 787 /* Destroy all links chained in 'avail' */ 788 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail) 789 { 790 PetscErrorCode ierr; 791 PetscSFLink link = *avail,next; 792 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 793 PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 794 795 PetscFunctionBegin; 796 for (; link; link=next) { 797 next = link->next; 798 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 799 for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. 
*/ 800 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 801 } 802 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 803 for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 804 #if defined(PETSC_HAVE_CUDA) 805 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 806 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 807 if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 808 #endif 809 ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 810 ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 811 } 812 ierr = PetscFree(link);CHKERRQ(ierr); 813 } 814 *avail = NULL; 815 PetscFunctionReturn(0); 816 } 817 818 #if defined(PETSC_USE_DEBUG) 819 /* Error out on unsupported overlapped communications */ 820 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 821 { 822 PetscErrorCode ierr; 823 PetscSFLink link,*p; 824 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 825 PetscBool match; 826 827 PetscFunctionBegin; 828 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 829 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 830 */ 831 if (rootdata || leafdata) { 832 for (p=&bas->inuse; (link=*p); p=&link->next) { 833 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 834 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. 
Undo the overlapping to avoid the error.",rootdata,leafdata); 835 } 836 } 837 PetscFunctionReturn(0); 838 } 839 #endif 840 841 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 842 { 843 PetscErrorCode ierr; 844 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 845 PetscBool is2Int,is2PetscInt; 846 PetscMPIInt ni,na,nd,combiner; 847 #if defined(PETSC_HAVE_COMPLEX) 848 PetscInt nPetscComplex=0; 849 #endif 850 851 PetscFunctionBegin; 852 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 853 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 854 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 855 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 856 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 857 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 858 #if defined(PETSC_HAVE_COMPLEX) 859 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 860 #endif 861 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 862 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 863 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 864 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 865 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 866 link->bs = 1; /* default */ 867 868 if (is2Int) { 869 PackInit_PairType_int_int_1_1(link); 870 link->bs = 1; 871 link->unitbytes = 2*sizeof(int); 872 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 873 link->basicunit = MPI_2INT; 874 link->unit = MPI_2INT; 875 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. 
*/ 876 PackInit_PairType_PetscInt_PetscInt_1_1(link); 877 link->bs = 1; 878 link->unitbytes = 2*sizeof(PetscInt); 879 link->basicunit = MPIU_2INT; 880 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 881 link->unit = MPIU_2INT; 882 } else if (nPetscReal) { 883 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 884 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 885 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 886 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 887 link->bs = nPetscReal; 888 link->unitbytes = nPetscReal*sizeof(PetscReal); 889 link->basicunit = MPIU_REAL; 890 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 891 } else if (nPetscInt) { 892 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 893 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 894 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 895 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 896 link->bs = nPetscInt; 897 link->unitbytes = nPetscInt*sizeof(PetscInt); 898 link->basicunit = MPIU_INT; 899 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 900 #if defined(PETSC_USE_64BIT_INDICES) 901 } else if (nInt) { 902 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 903 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else 
if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 904 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 905 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 906 link->bs = nInt; 907 link->unitbytes = nInt*sizeof(int); 908 link->basicunit = MPI_INT; 909 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 910 #endif 911 } else if (nSignedChar) { 912 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 913 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 914 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 915 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 916 link->bs = nSignedChar; 917 link->unitbytes = nSignedChar*sizeof(SignedChar); 918 link->basicunit = MPI_SIGNED_CHAR; 919 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 920 } else if (nUnsignedChar) { 921 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 922 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 923 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 924 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 925 link->bs = nUnsignedChar; 926 link->unitbytes = 
nUnsignedChar*sizeof(UnsignedChar); 927 link->basicunit = MPI_UNSIGNED_CHAR; 928 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 929 #if defined(PETSC_HAVE_COMPLEX) 930 } else if (nPetscComplex) { 931 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 932 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 933 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 934 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 935 link->bs = nPetscComplex; 936 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 937 link->basicunit = MPIU_COMPLEX; 938 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 939 #endif 940 } else { 941 MPI_Aint lb,nbyte; 942 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 943 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 944 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 945 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 946 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 947 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 948 link->bs = nbyte; 949 link->unitbytes = nbyte; 950 link->basicunit = MPI_BYTE; 951 } else { 952 nInt = nbyte / sizeof(int); 953 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 954 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if 
(nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 955 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 956 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 957 link->bs = nInt; 958 link->unitbytes = nbyte; 959 link->basicunit = MPI_INT; 960 } 961 if (link->isbuiltin) link->unit = unit; 962 } 963 964 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 965 PetscFunctionReturn(0); 966 } 967 968 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*)) 969 { 970 PetscFunctionBegin; 971 *UnpackAndOp = NULL; 972 if (mtype == PETSC_MEMTYPE_HOST) { 973 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 974 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 975 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 976 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 977 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 978 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 979 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 980 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 981 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 982 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 983 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 984 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 985 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 986 } 987 #if defined(PETSC_HAVE_CUDA) 988 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 989 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 990 else if (op == MPI_SUM || op == MPIU_SUM) 
*UnpackAndOp = link->d_UnpackAndAdd; 991 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 992 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 993 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 994 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 995 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 996 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 997 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 998 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 999 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 1000 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 1001 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 1002 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1003 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 1004 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 1005 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 1006 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 1007 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 1008 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 1009 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 1010 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 1011 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 1012 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 1013 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 1014 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 1015 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 1016 } 1017 #endif 1018 PetscFunctionReturn(0); 1019 } 1020 1021 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool 
atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*)) 1022 { 1023 PetscFunctionBegin; 1024 *ScatterAndOp = NULL; 1025 if (mtype == PETSC_MEMTYPE_HOST) { 1026 if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 1027 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 1028 else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 1029 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 1030 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 1031 else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 1032 else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 1033 else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 1034 else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 1035 else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 1036 else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 1037 else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 1038 else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 1039 } 1040 #if defined(PETSC_HAVE_CUDA) 1041 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 1042 if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 1043 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 1044 else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 1045 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 1046 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 1047 else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1048 else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1049 else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1050 else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1051 else if (op == MPI_LXOR) 
*ScatterAndOp = link->d_ScatterAndLXOR; 1052 else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1053 else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1054 else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1055 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1056 if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1057 else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1058 else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1059 else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1060 else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1061 else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1062 else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1063 else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1064 else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1065 else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1066 else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1067 else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1068 else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 1069 } 1070 #endif 1071 PetscFunctionReturn(0); 1072 } 1073 1074 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*)) 1075 { 1076 PetscFunctionBegin; 1077 *FetchAndOp = NULL; 1078 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1079 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1080 #if defined(PETSC_HAVE_CUDA) 1081 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1082 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) 
*FetchAndOp = link->da_FetchAndAdd; 1083 #endif 1084 PetscFunctionReturn(0); 1085 } 1086 1087 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*)) 1088 { 1089 PetscFunctionBegin; 1090 *FetchAndOpLocal = NULL; 1091 if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1092 if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1093 #if defined(PETSC_HAVE_CUDA) 1094 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1095 else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1096 #endif 1097 PetscFunctionReturn(0); 1098 } 1099 1100 /*============================================================================= 1101 A set of helper routines for Pack/Unpack/Scatter on GPUs 1102 ============================================================================*/ 1103 #if defined(PETSC_HAVE_CUDA) 1104 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 1105 make sure the data is ready for packing. 
1106 */ 1107 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 1108 { 1109 PetscFunctionBegin; 1110 if (sf->use_default_stream) PetscFunctionReturn(0); 1111 if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) { 1112 cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr); 1113 } 1114 PetscFunctionReturn(0); 1115 } 1116 1117 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 1118 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 1119 { 1120 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1121 1122 PetscFunctionBegin; 1123 /* Do nothing if we use stream aware mpi || has nothing for remote */ 1124 if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1125 /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 1126 if (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) { 1127 cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1128 } 1129 PetscFunctionReturn(0); 1130 } 1131 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 1132 { 1133 PetscFunctionBegin; 1134 /* See comments above */ 1135 if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1136 if (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) { 1137 cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1138 } 1139 PetscFunctionReturn(0); 1140 } 1141 1142 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 1143 does not know which 
stream the callers will use. 1144 */ 1145 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 1146 { 1147 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1148 1149 PetscFunctionBegin; 1150 /* Do nothing if we are expected to put rootdata on default stream */ 1151 if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0); 1152 /* If we have something from local, then we called a scatter kernel (on link->stream), then we must sync it; 1153 If we have something from remote and we called unpack kernel, then we must also sycn it. 1154 */ 1155 if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) { 1156 cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1157 } 1158 PetscFunctionReturn(0); 1159 } 1160 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 1161 { 1162 PetscFunctionBegin; 1163 /* See comments above */ 1164 if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0); 1165 if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) { 1166 cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1167 } 1168 PetscFunctionReturn(0); 1169 } 1170 1171 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 1172 to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 
1173 */ 1174 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1175 { 1176 PetscErrorCode ierr; 1177 cudaError_t cerr; 1178 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1179 1180 PetscFunctionBegin; 1181 if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 1182 void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1183 void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1184 size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 1185 if (device2host) { 1186 cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1187 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1188 } else { 1189 cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1190 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1191 } 1192 } 1193 PetscFunctionReturn(0); 1194 } 1195 1196 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1197 { 1198 PetscErrorCode ierr; 1199 cudaError_t cerr; 1200 1201 PetscFunctionBegin; 1202 if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 1203 void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1204 void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1205 size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 1206 if (device2host) { 1207 cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1208 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1209 } else { 1210 cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1211 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1212 } 1213 } 1214 PetscFunctionReturn(0); 1215 } 1216 #else 1217 
1218 #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 1219 #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 1220 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 1221 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 1222 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 1223 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1224 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1225 1226 #endif 1227 1228 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1229 { 1230 PetscErrorCode ierr; 1231 PetscLogDouble flops; 1232 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1233 1234 1235 PetscFunctionBegin; 1236 if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1237 flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1238 #if defined(PETSC_HAVE_CUDA) 1239 if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1240 #endif 1241 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1242 } 1243 PetscFunctionReturn(0); 1244 } 1245 1246 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1247 { 1248 PetscLogDouble flops; 1249 PetscErrorCode ierr; 1250 1251 PetscFunctionBegin; 1252 if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1253 flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1254 #if defined(PETSC_HAVE_CUDA) 1255 if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1256 #endif 1257 {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1258 } 1259 PetscFunctionReturn(0); 1260 } 1261 1262 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 
1263 Input Arguments: 1264 +sf - The StarForest 1265 .link - The link 1266 .count - Number of entries to unpack 1267 .start - The first index, significent when indices=NULL 1268 .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 1269 .buf - A contiguous buffer to unpack from 1270 -op - Operation after unpack 1271 1272 Output Arguments: 1273 .data - The data to unpack to 1274 */ 1275 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 1276 { 1277 PetscFunctionBegin; 1278 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1279 { 1280 PetscErrorCode ierr; 1281 PetscInt i; 1282 PetscMPIInt n; 1283 if (indices) { 1284 /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 1285 basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 1286 */ 1287 for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 1288 } else { 1289 ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 1290 ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr); 1291 } 1292 } 1293 #else 1294 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1295 #endif 1296 PetscFunctionReturn(0); 1297 } 1298 1299 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op) 1300 { 1301 PetscFunctionBegin; 1302 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1303 { 1304 PetscErrorCode ierr; 1305 PetscInt i,disp; 1306 for (i=0; i<count; i++) { 1307 disp = idy? 
idy[i] : starty + i; 1308 ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr); 1309 } 1310 } 1311 #else 1312 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1313 #endif 1314 PetscFunctionReturn(0); 1315 } 1316 1317 /*============================================================================= 1318 Pack/Unpack/Fetch/Scatter routines 1319 ============================================================================*/ 1320 1321 /* Pack rootdata to rootbuf 1322 Input Arguments: 1323 + sf - The SF this packing works on. 1324 . link - It gives the memtype of the roots and also provides root buffer. 1325 . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1326 - rootdata - Where to read the roots. 1327 1328 Notes: 1329 When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1330 in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not) 1331 */ 1332 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1333 { 1334 PetscErrorCode ierr; 1335 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1336 const PetscInt *rootindices = NULL; 1337 PetscInt count,start; 1338 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1339 PetscMemType rootmtype = link->rootmtype; 1340 1341 PetscFunctionBegin; 1342 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1343 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1344 if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1345 ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1346 ierr = 
PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1347 ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1348 } 1349 if (scope == PETSCSF_REMOTE) { 1350 ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1351 ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1352 } 1353 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1354 PetscFunctionReturn(0); 1355 } 1356 1357 /* Pack leafdata to leafbuf */ 1358 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1359 { 1360 PetscErrorCode ierr; 1361 const PetscInt *leafindices = NULL; 1362 PetscInt count,start; 1363 PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1364 PetscMemType leafmtype = link->leafmtype; 1365 1366 PetscFunctionBegin; 1367 ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1368 if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1369 if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1370 ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1371 ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1372 ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1373 } 1374 if (scope == PETSCSF_REMOTE) { 1375 ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1376 ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1377 } 1378 ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1379 PetscFunctionReturn(0); 1380 } 1381 1382 /* Unpack rootbuf to rootdata */ 1383 PetscErrorCode 
PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else { /* No native routine for this op: fall back to MPI_Reduce_local; indices must be host-resident */
      ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata; leaf-side twin of PetscSFLinkUnpackRootData */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else { /* No native routine for this op: fall back to MPI_Reduce_local; indices must be host-resident */
      ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr =
PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Rootbuf now holds the fetched (old) values; stage it back for MPI if needed */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case: roots and leaves on different memory types */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */
      ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
      rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */
      if (UnpackAndOp) {
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr);
      } else { /* Fall back to MPI_Reduce_local with host-resident indices */
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr);
      }
    } else { /* ScatterAndOp */
      ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
      if (ScatterAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr);
      } else { /* Fall back to MPI_Reduce_local with host-resident indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr);
        ierr =
PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}

/* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Roots and leaves on different memory types */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    if (sf->leafcontig[PETSCSF_LOCAL]) {
      /* If leaf indices are contiguous, Scatter becomes Unpack */
      ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
      leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */
      if (UnpackAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr);
      } else { /* Fall back to MPI_Reduce_local with host-resident indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
      if (ScatterAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr);
      } else { /* Fall back to MPI_Reduce_local with host-resident indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL,*leafindices = NULL;
  PetscInt       count,rootstart,leafstart;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  +  n       - Number of target ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
  -  idx     - [*] Array storing indices

   Output Parameters:
  +  opt     - Pack optimizations. NULL if no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       i,j,k,n_copies,tot_copies = 0,step;
  PetscBool      strided,optimized = PETSC_FALSE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  if (!n) {
    *out = NULL;
    PetscFunctionReturn(0);
  }

  ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
  ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
  /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should
     use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */
  if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];}

  opt->n = n;

  /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
  for (i=0; i<n; i++) { /* for each target processor */
    /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
    n_copies = 1;
    for (j=offset[i]; j<offset[i+1]-1; j++) {
      if (idx[j]+1 != idx[j+1]) n_copies++;
    }
    /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
       then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
    */
    if ((offset[i+1]-offset[i])/n_copies >= 32) {
      opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
      optimized   = PETSC_TRUE;
      tot_copies += n_copies;
    }
  }

  /* Setup memcpy plan for each contiguous piece */
  k = 0; /* k-th copy */
  ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
  for (i=0; i<n; i++) { /* for each target processor */
    if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
      n_copies           = 1;
      opt->copy_start[k] = offset[i] - offset[0];
      for (j=offset[i]; j<offset[i+1]-1; j++) {
        if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
          n_copies++;
          opt->copy_start[k+1] = j-offset[0]+1;
          opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
          k++;
        }
      }
      /* Set copy length of the last copy for this target */
      opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
      k++;
    }
    /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
    opt->copy_offset[i+1] = k;
  }

  /* Last chance!
     If the indices do not have long contiguous pieces, are they strided? */
  for (i=0; i<n; i++) { /* for each remote */
    if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
      strided = PETSC_TRUE;
      step    = idx[offset[i]+1] - idx[offset[i]];
      for (j=offset[i]; j<offset[i+1]-1; j++) {
        if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
      }
      if (strided) {
        opt->type[i]        = PETSCSF_PACKOPT_STRIDE;
        opt->stride_step[i] = step;
        opt->stride_n[i]    = offset[i+1] - offset[i];
        optimized           = PETSC_TRUE;
      }
    }
  }
  /* If no rank gets optimized, free arrays to save memory */
  if (!optimized) {
    ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
    ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else *out = opt;
  PetscFunctionReturn(0);
}

/* Free a pack-optimization plan created by PetscSFCreatePackOpt and set *out to NULL; no-op when *out is NULL */
PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
    ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

/* Analyze root/leaf index patterns of the SF and fill in the pack-related fields:
   buffer lengths, contiguity flags, start indices, per-rank pack optimizations and (CUDA) duplicate flags */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]  = 0;
    sf->leafcontig[i] = PETSC_TRUE;
    sf->leafdups[i]   = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous?
   */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { /* self */
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { /* remote */
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  /* If not contiguous, see if we can have per-rank optimizations by doing index analysis */
  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_CUDA)
  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                   &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],          &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                               &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks],   &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}

/* Destroy the per-scope pack optimization plans created by PetscSFSetUpPackFields */
PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}