1 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* 6 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 7 * therefore we pack data types manually. This file defines packing routines for the standard data types. 8 */ 9 10 #define CPPJoin2(a,b) a ##_## b 11 #define CPPJoin3(a,b,c) a ##_## b ##_## c 12 #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d 13 #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d) 14 15 #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 16 #define IGNORE(statement) do {} while(0) 17 18 #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 19 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 20 #define LXOR_OP(r,s,op,t) do {(r) = (!s) != (!t);} while(0) /* logical exclusive OR */ 21 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 22 23 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */ 24 25 /* DEF_PackFunc - macro defining a Pack routine 26 27 Arguments of the macro: 28 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 29 .BS Block size for vectorization. It is a factor of bs. 30 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 31 32 Arguments of the Pack routine: 33 +count Number of indices in idx[] 34 .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count) 35 .bs Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int 36 .opt Pack optimization plans. NULL means no plan at all. 37 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count) 38 -packed Address of the packed data for each rank 39 */ 40 #define DEF_PackFunc(Type,BS,EQ) \ 41 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 42 { \ 43 PetscErrorCode ierr; \ 44 const Type *u = (const Type*)unpacked,*u2; \ 45 Type *p = (Type*)packed,*p2; \ 46 PetscInt i,j,k,l,r,step; \ 47 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 48 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 49 PetscFunctionBegin; \ 50 if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 51 else if (!opt) { /* No optimizations available */ \ 52 for (i=0; i<count; i++) \ 53 for (j=0; j<M; j++) /* Decent compilers should fuse this loop when M = const 1 */ \ 54 for (k=0; k<BS; k++) /* Compiler either fuses (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 55 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 56 } else { \ 57 for (r=0; r<opt->n; r++) { \ 58 p2 = p + opt->offset[r]*MBS; \ 59 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 60 idx2 = idx + opt->offset[r]; \ 61 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 62 for (j=0; j<M; j++) \ 63 for (k=0; k<BS; k++) \ 64 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 65 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 66 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 67 u2 = u + idx[opt->copy_start[i]]*MBS; \ 68 l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 69 ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 70 p2 += l; \ 71 } \ 72 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 73 u2 = u + idx[opt->offset[r]]*MBS; \ 74 step = opt->stride_step[r]; \ 75 for (i=0; i<opt->stride_n[r]; i++) \ 76 for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 77 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 78 } \ 79 } \ 80 PetscFunctionReturn(0); \ 81 } 82 83 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 84 85 Arguments: 86 +action Unpack or Fetch 87 .Type Type of the data 88 .BS Block size for vectorization 89 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 90 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 91 .CType Type with or without the const qualifier, i.e., const Type or Type 92 .Cvoid void with or without the const qualifier, i.e., const void or void 93 94 Notes: 95 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 96 The two arguments CType and Cvoid are used (instead of one constness argument), because we want to 97 get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 98 sometimes we input 'const', sometimes we have to input empty. 99 100 If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare. 101 */ 102 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \ 103 static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 104 { \ 105 PetscErrorCode ierr; \ 106 Type *u = (Type*)unpacked,*u2; \ 107 CType *p = (CType*)packed,*p2; \ 108 PetscInt i,j,k,l,r,step; \ 109 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 110 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 111 PetscFunctionBegin; \ 112 if (!idx) { \ 113 FILTER(Type *v); \ 114 FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \ 115 FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \ 116 ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr); \ 117 FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \ 118 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 119 } else if (!opt) { /* No optimizations available */ \ 120 for (i=0; i<count; i++) \ 121 for (j=0; j<M; j++) \ 122 for (k=0; k<BS; k++) { \ 123 FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \ 124 u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 125 FILTER(p[i*MBS+j*BS+k] = t); \ 126 } \ 127 } else { \ 128 for (r=0; r<opt->n; r++) { \ 129 p2 = p + opt->offset[r]*MBS; \ 130 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 131 idx2 = idx + opt->offset[r]; \ 132 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 133 for (j=0; j<M; j++) \ 134 for (k=0; k<BS; k++) { \ 135 FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \ 136 u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 137 FILTER(p2[i*MBS+j*BS+k] = t); \ 138 } \ 139 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 140 FILTER(Type *v); \ 141 FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \ 142 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 143 u2 = u + idx[opt->copy_start[i]]*MBS; \ 144 l = opt->copy_length[i]*MBS; \ 145 FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 146 ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 147 FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \ 148 p2 += l; \ 149 } \ 150 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 151 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 152 u2 = u + idx[opt->offset[r]]*MBS; \ 153 step = opt->stride_step[r]; \ 154 for (i=0; i<opt->stride_n[r]; i++) \ 155 for (j=0; j<M; j++) \ 156 for (k=0; k<BS; k++) { \ 157 FILTER(Type t = u2[i*step*MBS+j*BS+k]); \ 158 u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 159 FILTER(p2[i*MBS+j*BS+k] = t); \ 160 } \ 161 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 162 } \ 163 } \ 164 PetscFunctionReturn(0); \ 165 } 166 167 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 168 169 Arguments: 170 +action Unpack or Fetch 171 .opname Name of the Op, such as Add, Mult, LAND, etc. 172 .Type Type of the data 173 .BS Block size for vectorization 174 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 175 .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 176 .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 177 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 178 .CType Type with or without the const qualifier, i.e., const Type or Type 179 -Cvoid void with or without the const qualifier, i.e., const void or void 180 */ 181 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \ 182 static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 183 { \ 184 Type *u = (Type*)unpacked,*u2,t; \ 185 CType *p = (CType*)packed,*p2; \ 186 PetscInt i,j,k,l,r,step; \ 187 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 188 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 189 PetscFunctionBegin; \ 190 if (!idx) { \ 191 for (i=0; i<count; i++) \ 192 for (j=0; j<M; j++) \ 193 for (k=0; k<BS; k++) { \ 194 t = u[i*MBS+j*BS+k]; \ 195 APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 196 FILTER(p[i*MBS+j*BS+k] = t); \ 197 } \ 198 } else if (!opt) { /* No optimizations available */ \ 199 for (i=0; i<count; i++) \ 200 for (j=0; j<M; j++) \ 201 for (k=0; k<BS; k++) { \ 202 t = u[idx[i]*MBS+j*BS+k]; \ 203 APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 204 FILTER(p[i*MBS+j*BS+k] = t); \ 205 } \ 206 } else { \ 207 for (r=0; r<opt->n; r++) { \ 208 p2 = p + opt->offset[r]*MBS; \ 209 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 210 idx2 = idx + opt->offset[r]; \ 211 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 212 for (j=0; j<M; j++) \ 213 for (k=0; k<BS; k++) { \ 214 t = u[idx2[i]*MBS+j*BS+k]; \ 215 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 216 FILTER(p2[i*MBS+j*BS+k] = t); \ 217 } \ 218 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 219 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 220 u2 = u + idx[opt->copy_start[i]]*MBS; \ 221 l = opt->copy_length[i]*MBS; \ 222 for (j=0; j<l; j++) { \ 223 t = u2[j]; \ 224 APPLY (u2[j],t,op,p2[j]); \ 225 FILTER(p2[j] = t); \ 226 } \ 227 p2 += l; \ 228 } \ 229 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 230 u2 = u + idx[opt->offset[r]]*MBS; \ 231 step = opt->stride_step[r]; \ 232 for (i=0; i<opt->stride_n[r]; i++) \ 233 for (j=0; j<M; j++) \ 234 for (k=0; k<BS; k++) { \ 235 t = u2[i*step*MBS+j*BS+k]; \ 236 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 237 FILTER(p2[i*MBS+j*BS+k] = t); \ 238 } \ 239 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 240 } \ 241 } \ 242 PetscFunctionReturn(0); \ 243 } 244 245 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 246 247 Arguments: 248 +Action Unpack or Fetch 249 .locname Max or Min 250 .type1 Type of the first data in a pair type 251 .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 252 .op > or < 253 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 254 .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2) 255 -Cvoid void with or without the const qualifier, i.e., const void or void 256 */ 257 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \ 258 static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \ 259 PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \ 260 CType *p = (CType*)packed; \ 261 PetscInt i,j; \ 262 for (i=0; i<count; i++) { \ 263 j = idx? idx[i] : i; \ 264 FILTER(PairType(Type1,Type2) v = u[j]); \ 265 if (p[i].a op u[j].a) { \ 266 u[j] = p[i]; \ 267 } else if (p[i].a == u[j].a) { \ 268 u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 269 } \ 270 FILTER(p[i] = v); \ 271 } \ 272 PetscFunctionReturn(0); \ 273 } 274 275 /* Pack, Unpack/Fetch ops */ 276 #define DEF_Pack(Type,BS,EQ) \ 277 DEF_PackFunc(Type,BS,EQ) \ 278 DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \ 279 DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \ 280 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \ 281 link->Pack = CPPJoin4(Pack, Type,BS,EQ); \ 282 link->UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 283 link->FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \ 284 link->unitbytes = sizeof(Type); \ 285 } 286 287 /* Add, Mult ops */ 288 #define DEF_Add(Type,BS,EQ) \ 289 DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \ 290 DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \ 291 DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \ 292 DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \ 293 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \ 294 link->UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 295 link->UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 296 link->FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 297 link->FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \ 298 } 299 300 /* Max, Min ops */ 301 #define DEF_Cmp(Type,BS,EQ) \ 302 DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \ 303 DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \ 304 DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \ 305 DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \ 306 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \ 307 link->UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 308 link->UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 309 link->FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \ 310 link->FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \ 311 } 312 313 /* Logical ops. 314 The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 315 the compilation warning "empty macro arguments are undefined in ISO C90" 316 */ 317 #define DEF_Log(Type,BS,EQ) \ 318 DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \ 319 DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \ 320 DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \ 321 DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \ 322 DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \ 323 DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \ 324 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \ 325 link->UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \ 326 link->UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 327 link->UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \ 328 link->FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \ 329 link->FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \ 330 link->FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \ 331 } 332 333 /* Bitwise ops */ 334 #define DEF_Bit(Type,BS,EQ) \ 335 DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \ 336 DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \ 337 DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \ 338 DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \ 339 DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \ 340 DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \ 341 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \ 342 link->UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \ 343 link->UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 344 link->UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \ 345 link->FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \ 346 link->FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \ 347 link->FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \ 348 } 349 350 /* Maxloc, Minloc */ 351 #define DEF_Xloc(Type1,Type2) \ 352 DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \ 353 DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \ 354 DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \ 355 DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \ 356 static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \ 357 link->UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \ 358 link->UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \ 359 link->FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \ 360 link->FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \ 361 } 362 363 #define DEF_IntegerType(Type,BS,EQ) \ 364 DEF_Pack(Type,BS,EQ) \ 365 DEF_Add(Type,BS,EQ) \ 366 DEF_Cmp(Type,BS,EQ) \ 367 DEF_Log(Type,BS,EQ) \ 368 DEF_Bit(Type,BS,EQ) \ 369 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \ 370 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 371 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 372 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 373 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 375 } 376 377 #define DEF_RealType(Type,BS,EQ) \ 378 DEF_Pack(Type,BS,EQ) \ 379 DEF_Add(Type,BS,EQ) \ 380 DEF_Cmp(Type,BS,EQ) \ 381 DEF_Log(Type,BS,EQ) \ 382 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \ 383 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 384 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 385 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 386 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 387 } 388 389 #if defined(PETSC_HAVE_COMPLEX) 390 #define DEF_ComplexType(Type,BS,EQ) \ 391 DEF_Pack(Type,BS,EQ) \ 392 DEF_Add(Type,BS,EQ) \ 393 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \ 394 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 395 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 396 } 397 #endif 398 399 #define DEF_DumbType(Type,BS,EQ) \ 400 DEF_Pack(Type,BS,EQ) \ 401 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \ 402 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 403 } 404 405 /* Maxloc, Minloc */ 406 #define DEF_PairType(Type1,Type2) \ 407 typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \ 408 DEF_Pack(PairType(Type1,Type2),1,1) \ 409 DEF_Xloc(Type1,Type2) \ 410 static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \ 411 CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \ 412 CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \ 413 } 414 415 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 416 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 417 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 418 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 419 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 420 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 421 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 422 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 423 424 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 425 DEF_IntegerType(int,1,1) 426 DEF_IntegerType(int,2,1) 427 DEF_IntegerType(int,4,1) 428 DEF_IntegerType(int,8,1) 429 DEF_IntegerType(int,1,0) 430 DEF_IntegerType(int,2,0) 431 DEF_IntegerType(int,4,0) 432 DEF_IntegerType(int,8,0) 433 #endif 434 435 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 436 typedef signed char SignedChar; 437 DEF_IntegerType(SignedChar,1,1) 438 DEF_IntegerType(SignedChar,2,1) 439 DEF_IntegerType(SignedChar,4,1) 440 DEF_IntegerType(SignedChar,8,1) 441 DEF_IntegerType(SignedChar,1,0) 442 DEF_IntegerType(SignedChar,2,0) 443 DEF_IntegerType(SignedChar,4,0) 444 DEF_IntegerType(SignedChar,8,0) 445 446 typedef unsigned char UnsignedChar; 447 DEF_IntegerType(UnsignedChar,1,1) 448 DEF_IntegerType(UnsignedChar,2,1) 449 DEF_IntegerType(UnsignedChar,4,1) 450 DEF_IntegerType(UnsignedChar,8,1) 451 DEF_IntegerType(UnsignedChar,1,0) 452 DEF_IntegerType(UnsignedChar,2,0) 453 DEF_IntegerType(UnsignedChar,4,0) 454 DEF_IntegerType(UnsignedChar,8,0) 455 456 DEF_RealType(PetscReal,1,1) 457 DEF_RealType(PetscReal,2,1) 458 DEF_RealType(PetscReal,4,1) 459 DEF_RealType(PetscReal,8,1) 460 DEF_RealType(PetscReal,1,0) 461 DEF_RealType(PetscReal,2,0) 462 DEF_RealType(PetscReal,4,0) 463 DEF_RealType(PetscReal,8,0) 464 465 #if defined(PETSC_HAVE_COMPLEX) 466 DEF_ComplexType(PetscComplex,1,1) 467 DEF_ComplexType(PetscComplex,2,1) 468 DEF_ComplexType(PetscComplex,4,1) 469 DEF_ComplexType(PetscComplex,8,1) 470 DEF_ComplexType(PetscComplex,1,0) 471 DEF_ComplexType(PetscComplex,2,0) 472 DEF_ComplexType(PetscComplex,4,0) 473 DEF_ComplexType(PetscComplex,8,0) 474 #endif 475 476 DEF_PairType(int,int) 477 DEF_PairType(PetscInt,PetscInt) 478 479 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 480 DEF_DumbType(char,1,1) 481 DEF_DumbType(char,2,1) 482 DEF_DumbType(char,4,1) 483 DEF_DumbType(char,1,0) 484 DEF_DumbType(char,2,0) 485 DEF_DumbType(char,4,0) 486 487 typedef int DumbInt; /* To differentiate with regular int used above */ 488 DEF_DumbType(DumbInt,1,1) 489 DEF_DumbType(DumbInt,2,1) 490 DEF_DumbType(DumbInt,4,1) 491 DEF_DumbType(DumbInt,8,1) 492 DEF_DumbType(DumbInt,1,0) 493 DEF_DumbType(DumbInt,2,0) 494 DEF_DumbType(DumbInt,4,0) 495 DEF_DumbType(DumbInt,8,0) 496 497 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 498 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 499 { 500 int ierr; 501 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 502 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 503 return MPI_SUCCESS; 504 } 505 #endif 506 507 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink) 508 { 509 PetscErrorCode ierr; 510 PetscSFPack link,*p; 511 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 512 513 PetscFunctionBegin; 514 /* Look for types in cache */ 515 for (p=&bas->inuse; (link=*p); p=&link->next) { 516 PetscBool match; 517 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 518 if (match && (rkey == link->rkey) && (lkey == link->lkey)) { 519 switch (cmode) { 520 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 521 case PETSC_USE_POINTER: break; 522 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 523 } 524 *mylink = link; 525 PetscFunctionReturn(0); 526 } 527 } 528 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 529 PetscFunctionReturn(0); 530 } 531 532 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 533 { 534 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 535 536 PetscFunctionBegin; 537 (*link)->rkey = NULL; 538 (*link)->lkey = NULL; 539 (*link)->next = bas->avail; 540 bas->avail = *link; 541 *link = NULL; 542 PetscFunctionReturn(0); 543 } 544 545 /* Error out on unsupported overlapped communications */ 546 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey) 547 { 548 PetscErrorCode ierr; 549 PetscSFPack link,*p; 550 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 551 PetscBool match; 552 553 PetscFunctionBegin; 554 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 555 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 556 */ 557 if (rkey || lkey) { 558 for (p=&bas->inuse; (link=*p); p=&link->next) { 559 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 560 if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey); 561 } 562 } 563 PetscFunctionReturn(0); 564 } 565 566 PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit) 567 { 568 PetscErrorCode ierr; 569 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 570 PetscBool is2Int,is2PetscInt; 571 PetscMPIInt ni,na,nd,combiner; 572 #if defined(PETSC_HAVE_COMPLEX) 573 PetscInt nPetscComplex=0; 574 #endif 575 576 PetscFunctionBegin; 577 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 578 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 579 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 580 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 581 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 582 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 583 #if defined(PETSC_HAVE_COMPLEX) 584 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 585 #endif 586 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 587 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 588 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 589 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 590 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; 591 link->bs = 1; /* default */ 592 593 if (nPetscReal) { 594 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 595 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 596 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 597 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 598 link->bs = nPetscReal; 599 link->unitbytes *= nPetscReal; 600 link->basicunit = MPIU_REAL; 601 } else if (nPetscInt) { 602 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 603 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 604 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 605 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 606 link->bs = nPetscInt; 607 link->unitbytes *= nPetscInt; 608 link->basicunit = MPIU_INT; 609 #if defined(PETSC_USE_64BIT_INDICES) 610 } else if (nInt) { 611 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 612 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 613 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 614 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 615 link->bs = nInt; 616 link->unitbytes *= nInt; 617 link->basicunit = MPI_INT; 618 #endif 619 } else if (nSignedChar) { 620 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 621 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 622 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 623 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 624 link->bs = nSignedChar; 625 link->unitbytes *= nSignedChar; 626 link->basicunit = MPI_SIGNED_CHAR; 627 } else if (nUnsignedChar) { 628 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 629 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 630 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 631 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 632 link->bs = nUnsignedChar; 633 link->unitbytes *= nUnsignedChar; 634 link->basicunit = MPI_UNSIGNED_CHAR; 635 #if defined(PETSC_HAVE_COMPLEX) 636 } else if (nPetscComplex) { 637 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 638 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 639 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 640 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 641 link->bs = nPetscComplex; 642 link->unitbytes *= nPetscComplex; 643 link->basicunit = MPIU_COMPLEX; 644 #endif 645 } else if (is2Int) { 646 PackInit_PairType_int_int(link); 647 link->bs = 1; 648 link->basicunit = MPI_2INT; 649 } else if (is2PetscInt) { 650 PackInit_PairType_PetscInt_PetscInt(link); 651 link->bs = 1; 652 link->basicunit = MPIU_2INT; 653 } else { 654 MPI_Aint lb,nbyte; 655 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 656 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 657 if (nbyte % sizeof(DumbInt)) { /* If the type size is not multiple of int */ 658 if (nbyte == 4) {PackInit_DumbType_char_4_1(link); link->bs = nbyte/4;} else if (nbyte%4 == 0) {PackInit_DumbType_char_4_0(link); link->bs = nbyte/4;} /* Note the basic type is char[4] */ 659 else if (nbyte == 2) {PackInit_DumbType_char_2_1(link); link->bs = nbyte/2;} else if (nbyte%2 == 0) {PackInit_DumbType_char_2_0(link); link->bs = nbyte/2;} 660 else if (nbyte == 1) {PackInit_DumbType_char_1_1(link); link->bs = nbyte/1;} else if (nbyte%1 == 0) {PackInit_DumbType_char_1_0(link); link->bs = nbyte/1;} 661 link->unitbytes = nbyte; 662 link->basicunit = MPI_BYTE; 663 } else { 664 nInt = nbyte / sizeof(DumbInt); 665 if (nInt == 8) {PackInit_DumbType_DumbInt_8_1(link); link->bs = nInt/8;} else if (nInt%8 == 0) {PackInit_DumbType_DumbInt_8_0(link); link->bs = nInt/8;} 666 else if (nInt == 4) {PackInit_DumbType_DumbInt_4_1(link); link->bs = nInt/4;} else if (nInt%4 == 0) {PackInit_DumbType_DumbInt_4_0(link); link->bs = nInt/4;} 667 else if (nInt == 2) {PackInit_DumbType_DumbInt_2_1(link); link->bs = nInt/2;} else if (nInt%2 == 0) {PackInit_DumbType_DumbInt_2_0(link); link->bs = nInt/2;} 668 else if (nInt == 1) {PackInit_DumbType_DumbInt_1_1(link); link->bs = nInt/1;} else if (nInt%1 == 0) {PackInit_DumbType_DumbInt_1_0(link); link->bs = nInt/1;} 669 link->unitbytes = nbyte; 670 link->basicunit = MPI_INT; 671 } 672 } 673 674 if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */ 675 else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 676 PetscFunctionReturn(0); 677 } 678 679 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*)) 680 { 681 PetscFunctionBegin; 682 *UnpackAndOp = NULL; 683 if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert; 684 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd; 685 else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult; 686 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax; 687 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin; 688 else if (op == MPI_LAND) *UnpackAndOp = link->UnpackAndLAND; 689 else if (op == MPI_BAND) *UnpackAndOp = link->UnpackAndBAND; 690 else if (op == MPI_LOR) *UnpackAndOp = link->UnpackAndLOR; 691 else if (op == MPI_BOR) *UnpackAndOp = link->UnpackAndBOR; 692 else if (op == MPI_LXOR) *UnpackAndOp = link->UnpackAndLXOR; 693 else if (op == MPI_BXOR) *UnpackAndOp = link->UnpackAndBXOR; 694 else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc; 695 else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc; 696 else *UnpackAndOp = NULL; 697 PetscFunctionReturn(0); 698 } 699 700 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*)) 701 { 702 PetscFunctionBegin; 703 *FetchAndOp = NULL; 704 if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert; 705 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd; 706 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax; 707 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin; 708 else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc; 709 else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc; 710 else if (op == MPI_PROD) *FetchAndOp = link->FetchAndMult; 711 else if (op == MPI_LAND) *FetchAndOp = link->FetchAndLAND; 712 else if (op == MPI_BAND) *FetchAndOp = link->FetchAndBAND; 713 else if (op == MPI_LOR) *FetchAndOp = link->FetchAndLOR; 714 else if (op == MPI_BOR) *FetchAndOp = link->FetchAndBOR; 715 else if (op == MPI_LXOR) *FetchAndOp = link->FetchAndLXOR; 716 else if (op == MPI_BXOR) *FetchAndOp = link->FetchAndBXOR; 717 else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op"); 718 PetscFunctionReturn(0); 719 } 720 721 /* 722 Setup pack/unpack optimization plans based on indice patterns available 723 724 Input Parameters: 725 + n - Number of target ranks 726 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] is not necessarily 0. 727 - idx - [*] Array storing indices 728 729 Output Parameters: 730 + opt - the optimization 731 */ 732 PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 733 { 734 PetscErrorCode ierr; 735 PetscInt i,j,k,n_copies,tot_copies=0,step; 736 PetscBool strided,optimized=PETSC_FALSE; 737 PetscSFPackOpt opt; 738 739 PetscFunctionBegin; 740 if (!n) { 741 *out = NULL; 742 PetscFunctionReturn(0); 743 } 744 745 ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 746 ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 747 ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 748 if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/ 749 750 opt->n = n; 751 752 /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 753 for (i=0; i<n; i++) { /* for each target processor */ 754 /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 755 n_copies = 1; 756 for (j=offset[i]; j<offset[i+1]-1; j++) { 757 if (idx[j]+1 != idx[j+1]) n_copies++; 758 } 759 /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 760 then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 761 */ 762 if ((offset[i+1]-offset[i])/n_copies >= 32) { 763 opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 764 optimized = PETSC_TRUE; 765 tot_copies += n_copies; 766 } 767 } 768 769 /* Setup memcpy plan for each contiguous piece */ 770 k = 0; /* k-th copy */ 771 ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 772 for (i=0; i<n; i++) { /* for each target processor */ 773 if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 774 n_copies = 1; 775 opt->copy_start[k] = offset[i] - offset[0]; 776 for (j=offset[i]; j<offset[i+1]-1; j++) { 777 if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 778 n_copies++; 779 opt->copy_start[k+1] = j-offset[0]+1; 780 opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 781 k++; 782 } 783 } 784 /* Set copy length of the last copy for this target */ 785 opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 786 k++; 787 } 788 /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 789 opt->copy_offset[i+1] = k; 790 } 791 792 /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 793 for (i=0; i<n; i++) { /* for each remote */ 794 if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 795 strided = PETSC_TRUE; 796 step = idx[offset[i]+1] - idx[offset[i]]; 797 for (j=offset[i]; j<offset[i+1]-1; j++) { 798 if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 799 } 800 if (strided) { 801 opt->type[i] = PETSCSF_PACKOPT_STRIDE; 802 opt->stride_step[i] = step; 803 opt->stride_n[i] = offset[i+1] - offset[i]; 804 optimized = PETSC_TRUE; 805 } 806 } 807 } 808 /* If no rank gets optimized, free arrays to save memory */ 809 if (!optimized) { 810 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 811 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 812 ierr = PetscFree(opt);CHKERRQ(ierr); 813 *out = NULL; 814 } else *out = opt; 815 PetscFunctionReturn(0); 816 } 817 818 PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out) 819 { 820 PetscErrorCode ierr; 821 PetscSFPackOpt opt = *out; 822 823 PetscFunctionBegin; 824 if (opt) { 825 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 826 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 827 ierr = PetscFree(opt);CHKERRQ(ierr); 828 *out = NULL; 829 } 830 PetscFunctionReturn(0); 831 } 832