1 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 /* 6 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 7 * therefore we pack data types manually. This file defines packing routines for the standard data types. 8 */ 9 10 #define CPPJoin2_exp(a,b) a ## b 11 #define CPPJoin2(a,b) CPPJoin2_exp(a,b) 12 #define CPPJoin3_exp_(a,b,c) a ## b ## _ ## c 13 #define CPPJoin3_(a,b,c) CPPJoin3_exp_(a,b,c) 14 15 #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 16 #define IGNORE(statement) do {} while(0) 17 18 #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 19 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 20 #define LXOR_OP(r,s,op,t) do {(r) = (!s) != (!t);} while(0) /* logical exclusive OR */ 21 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 22 23 #define BlockType(type,count) CPPJoin3_(_blocktype_,type,count) /* typename for struct {type v[count];} */ 24 #define PairType(type1,type2) CPPJoin3_(_pairtype_,type1,type2) /* typename for struct {type1 a; type2 b;} */ 25 26 /* DEF_PackFunc - macro defining a Pack routine 27 28 Arguments of the macro: 29 +type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 30 -BS Block size for vectorization. It is a factor of bs. 31 32 Arguments of the Pack routine: 33 +n Number of entries to pack. Each entry is of type 'unit'. Here the unit is the arg used in calls like PetscSFBcastBegin(sf,unit,..). 34 If idx in not NULL, then n also indicates the number of indices in idx[] 35 .bs Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int 36 .idx Indices of entries. NULL means contiguous indices [0,n) 37 .r Do packing for the r-th target processor 38 .opt Pack optimization plans. NULL means no plan. 39 .unpacked Address of the unpacked data 40 -packed Address of the packed data 41 */ 42 #define DEF_PackFunc(type,BS) \ 43 static PetscErrorCode CPPJoin3_(Pack_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,const void *unpacked,void *packed) { \ 44 PetscErrorCode ierr; \ 45 const type *u = (const type*)unpacked,*u2; \ 46 type *p = (type*)packed; \ 47 PetscInt i,j,k,l,step; \ 48 PetscFunctionBegin; \ 49 if (!idx) { /* idx[] is contiguous */ \ 50 ierr = PetscArraycpy(p,u,bs*n);CHKERRQ(ierr); \ 51 } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 52 for (i=0; i<n; i++) \ 53 for (j=0; j<bs; j+=BS) \ 54 for (k=j; k<j+BS; k++) \ 55 p[i*bs+k] = u[idx[i]*bs+k]; \ 56 } else { /* idx[] is optimized*/ \ 57 if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 58 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 59 l = opt->copy_length[i]*bs; /* length in types */ \ 60 u2 = u + opt->copy_start[i]*bs; \ 61 ierr = PetscArraycpy(p,u2,l);CHKERRQ(ierr); \ 62 p += l; \ 63 } \ 64 } else { /* idx[] is strided */ \ 65 u += opt->stride_first[r]*bs; \ 66 step = opt->stride_step[r]; \ 67 for (i=0; i<opt->stride_n[r]; i++) \ 68 for (j=0; j<bs; j++) \ 69 p[i*bs+j] = u[i*step*bs+j]; \ 70 } \ 71 } \ 72 PetscFunctionReturn(0); \ 73 } 74 75 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 76 77 Arguments: 78 +action Unpack or Fetch 79 .type Type of the data 80 .BS Block size for vectorization 81 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 82 .ctype Type with or without the const qualifier, i.e., const type or type 83 .cvoid void with or without the const qualifier, i.e., const void or void 84 85 Notes: 86 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 87 The two arguments ctype and cvoid are used (instead of one constness argument), because we want to 88 get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 89 sometimes we input 'const', sometimes we have to input empty. 90 */ 91 #define DEF_Action(action,type,BS,FILTER,ctype,cvoid) \ 92 static PetscErrorCode CPPJoin3_(action##AndInsert_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 93 PetscErrorCode ierr; \ 94 type *u = (type*)unpacked,*u2; \ 95 ctype *p = (ctype*)packed; \ 96 PetscInt i,j,k,l,step; \ 97 PetscFunctionBegin; \ 98 if (!idx) { /* idx[] is contiguous */ \ 99 FILTER(type *v); \ 100 FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); \ 101 FILTER(ierr = PetscArraycpy(v,u,bs*n);CHKERRQ(ierr)); \ 102 ierr = PetscArraycpy(u,p,bs*n);CHKERRQ(ierr); \ 103 FILTER(ierr = PetscArraycpy(p,v,bs*n);CHKERRQ(ierr)); \ 104 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 105 } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 106 for (i=0; i<n; i++) { \ 107 for (j=0; j<bs; j+=BS) { \ 108 for (k=j; k<j+BS; k++) { \ 109 FILTER(type t = u[idx[i]*bs+k]); \ 110 u[idx[i]*bs+k] = p[i*bs+k]; \ 111 FILTER(p[i*bs+k] = t); \ 112 } \ 113 } \ 114 } \ 115 } else { /* idx[] is optimized*/ \ 116 if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 117 FILTER(type *v); \ 118 FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); /* maximal buffer */ \ 119 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 120 l = opt->copy_length[i]*bs; /* length in types */ \ 121 u2 = u + opt->copy_start[i]*bs; \ 122 FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 123 ierr = PetscArraycpy(u2,p,l);CHKERRQ(ierr); \ 124 FILTER(ierr = PetscArraycpy(p,v,l);CHKERRQ(ierr)); \ 125 p += l; \ 126 } \ 127 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 128 } else { /* idx[] is strided */ \ 129 u += opt->stride_first[r]*bs; \ 130 step = opt->stride_step[r]; \ 131 for (i=0; i<opt->stride_n[r]; i++) \ 132 for (j=0; j<bs; j++) { \ 133 FILTER(type t = u[i*step*bs+j]); \ 134 u[i*step*bs+j] = p[i*bs+j]; \ 135 FILTER(p[i*bs+j] = t); \ 136 } \ 137 } \ 138 } \ 139 PetscFunctionReturn(0); \ 140 } 141 142 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 143 144 Arguments: 145 +action Unpack or Fetch 146 .opname Name of the Op, such as Add, Mult, LAND, etc. 147 .type Type of the data 148 .BS Block size for vectorization 149 .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 150 .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 151 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 152 .ctype Type with or without the const qualifier, i.e., const type or type 153 -cvoid void with or without the const qualifier, i.e., const void or void 154 */ 155 #define DEF_ActionAndOp(action,opname,type,BS,op,APPLY,FILTER,ctype,cvoid) \ 156 static PetscErrorCode CPPJoin3_(action##And##opname##_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 157 type *u = (type*)unpacked,*u2,t; \ 158 ctype *p = (ctype*)packed; \ 159 PetscInt i,j,k,l,step; \ 160 PetscFunctionBegin; \ 161 if (!idx) { /* idx[] is contiguous */ \ 162 for (i=0; i<n*bs; i++) { \ 163 t = u[i]; \ 164 APPLY(u[i],t,op,p[i]); \ 165 FILTER(p[i] = t); \ 166 } \ 167 } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 168 for (i=0; i<n; i++) { \ 169 for (j=0; j<bs; j+=BS) { \ 170 for (k=j; k<j+BS; k++) { \ 171 t = u[idx[i]*bs+k]; \ 172 APPLY(u[idx[i]*bs+k],t,op,p[i*bs+k]); \ 173 FILTER(p[i*bs+k] = t); \ 174 } \ 175 } \ 176 } \ 177 } else { /* idx[] is optimized*/ \ 178 if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 179 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 180 l = opt->copy_length[i]*bs; /* length in types */ \ 181 u2 = u + opt->copy_start[i]*bs; \ 182 for (j=0; j<l; j++) { \ 183 t = u2[j]; \ 184 APPLY(u2[j],t,op,p[j]); \ 185 FILTER(p[j] = t); \ 186 } \ 187 p += l; \ 188 } \ 189 } else { /* idx[] is strided */ \ 190 u += opt->stride_first[r]*bs; \ 191 step = opt->stride_step[r]; \ 192 for (i=0; i<opt->stride_n[r]; i++) \ 193 for (j=0; j<bs; j++) { \ 194 t = u[i*step*bs+j]; \ 195 APPLY(u[i*step*bs+j],t,op,p[i*bs+j]); \ 196 FILTER(p[i*bs+j] = t); \ 197 } \ 198 } \ 199 } \ 200 PetscFunctionReturn(0); \ 201 } 202 203 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 204 205 Arguments: 206 +Action Unpack or Fetch 207 .locname Max or Min 208 .type1 Type of the first data in a pair type 209 .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 210 .op > or < 211 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 212 .ctype Type with or without the const qualifier, i.e., const PairType(type1,type2) or PairType(type1,type2) 213 -cvoid void with or without the const qualifier, i.e., const void or void 214 */ 215 #define DEF_ActionAndXloc(action,locname,type1,type2,op,FILTER,ctype,cvoid) \ 216 static PetscErrorCode CPPJoin3_(action##And##locname##loc_,PairType(type1,type2),1)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 217 PairType(type1,type2) *u = (PairType(type1,type2)*)unpacked; \ 218 ctype *p = (ctype*)packed; \ 219 PetscInt i; \ 220 for (i=0; i<n; i++) { \ 221 PetscInt j = idx[i]; \ 222 FILTER(PairType(type1,type2) v = u[j]); \ 223 if (p[i].a op u[j].a) { \ 224 u[j] = p[i]; \ 225 } else if (p[i].a == u[j].a) { \ 226 u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 227 } \ 228 FILTER(p[i] = v); \ 229 } \ 230 PetscFunctionReturn(0); \ 231 } 232 233 234 /* Pack/unpack/fetch ops for all types */ 235 #define DEF_PackNoInit(type,BS) \ 236 DEF_PackFunc(type,BS) \ 237 DEF_Action(Unpack,type,BS,IGNORE,const type,const void) \ 238 DEF_Action(Fetch, type,BS,EXECUTE,type,void) \ 239 240 241 /* Extra addition ops for types supporting them */ 242 #define DEF_PackAddNoInit(type,BS) \ 243 DEF_PackNoInit(type,BS) \ 244 DEF_ActionAndOp(Unpack,Add, type,BS,+,BINARY_OP,IGNORE,const type,const void) \ 245 DEF_ActionAndOp(Unpack,Mult,type,BS,*,BINARY_OP,IGNORE,const type,const void) \ 246 DEF_ActionAndOp(Fetch, Add, type,BS,+,BINARY_OP,EXECUTE,type,void) \ 247 DEF_ActionAndOp(Fetch, Mult,type,BS,*,BINARY_OP,EXECUTE,type,void) 248 249 /* Basic types */ 250 #define DEF_Pack(type,BS) \ 251 DEF_PackAddNoInit(type,BS) \ 252 static void CPPJoin3_(PackInit_,type,BS)(PetscSFPack link) { \ 253 link->Pack = CPPJoin3_(Pack_, type,BS); \ 254 link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,BS); \ 255 link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, type,BS); \ 256 link->UnpackAndMult = CPPJoin3_(UnpackAndMult_, type,BS); \ 257 link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, type,BS); \ 258 link->FetchAndAdd = CPPJoin3_(FetchAndAdd_, type,BS); \ 259 link->FetchAndMult = CPPJoin3_(FetchAndMult_, type,BS); \ 260 link->unitbytes = sizeof(type); \ 261 } 262 263 /* Comparable types */ 264 #define DEF_PackCmp(type) \ 265 DEF_PackAddNoInit(type,1) \ 266 DEF_ActionAndOp(Unpack,Max,type,1,PetscMax,FUNCTION_OP,IGNORE,const type,const void) \ 267 DEF_ActionAndOp(Unpack,Min,type,1,PetscMin,FUNCTION_OP,IGNORE,const type,const void) \ 268 DEF_ActionAndOp(Fetch, Max,type,1,PetscMax,FUNCTION_OP,EXECUTE,type,void) \ 269 DEF_ActionAndOp(Fetch, Min,type,1,PetscMin,FUNCTION_OP,EXECUTE,type,void) \ 270 static void CPPJoin2(PackInit_,type)(PetscSFPack link) { \ 271 link->Pack = CPPJoin3_(Pack_, type,1); \ 272 link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,1); \ 273 link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, type,1); \ 274 link->UnpackAndMult = CPPJoin3_(UnpackAndMult_, type,1); \ 275 link->UnpackAndMax = CPPJoin3_(UnpackAndMax_, type,1); \ 276 link->UnpackAndMin = CPPJoin3_(UnpackAndMin_, type,1); \ 277 link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, type,1); \ 278 link->FetchAndAdd = CPPJoin3_(FetchAndAdd_ , type,1); \ 279 link->FetchAndMult = CPPJoin3_(FetchAndMult_, type,1); \ 280 link->FetchAndMax = CPPJoin3_(FetchAndMax_ , type,1); \ 281 link->FetchAndMin = CPPJoin3_(FetchAndMin_ , type,1); \ 282 link->unitbytes = sizeof(type); \ 283 } 284 285 /* Logical Types */ 286 /* The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 287 the compilation warning "empty macro arguments are undefined in ISO C90" 288 */ 289 #define DEF_PackLog(type) \ 290 DEF_ActionAndOp(Unpack,LAND,type,1,&&,BINARY_OP,IGNORE,const type,const void) \ 291 DEF_ActionAndOp(Unpack,LOR, type,1,||,BINARY_OP,IGNORE,const type,const void) \ 292 DEF_ActionAndOp(Unpack,LXOR,type,1,&, LXOR_OP, IGNORE,const type,const void) \ 293 DEF_ActionAndOp(Fetch, LAND,type,1,&&,BINARY_OP,EXECUTE,type,void) \ 294 DEF_ActionAndOp(Fetch, LOR, type,1,||,BINARY_OP,EXECUTE,type,void) \ 295 DEF_ActionAndOp(Fetch, LXOR,type,1,&, LXOR_OP, EXECUTE,type,void) \ 296 static void CPPJoin2(PackInit_Logical_,type)(PetscSFPack link) { \ 297 link->UnpackAndLAND = CPPJoin3_(UnpackAndLAND_,type,1); \ 298 link->UnpackAndLOR = CPPJoin3_(UnpackAndLOR_, type,1); \ 299 link->UnpackAndLXOR = CPPJoin3_(UnpackAndLXOR_,type,1); \ 300 link->FetchAndLAND = CPPJoin3_(FetchAndLAND_, type,1); \ 301 link->FetchAndLOR = CPPJoin3_(FetchAndLOR_, type,1); \ 302 link->FetchAndLXOR = CPPJoin3_(FetchAndLXOR_, type,1); \ 303 } 304 305 306 /* Bitwise Types */ 307 #define DEF_PackBit(type) \ 308 DEF_ActionAndOp(Unpack,BAND,type,1,&,BINARY_OP,IGNORE,const type,const void) \ 309 DEF_ActionAndOp(Unpack,BOR, type,1,|,BINARY_OP,IGNORE,const type,const void) \ 310 DEF_ActionAndOp(Unpack,BXOR,type,1,^,BINARY_OP,IGNORE,const type,const void) \ 311 DEF_ActionAndOp(Fetch, BAND,type,1,&,BINARY_OP,EXECUTE,type,void) \ 312 DEF_ActionAndOp(Fetch, BOR, type,1,|,BINARY_OP,EXECUTE,type,void) \ 313 DEF_ActionAndOp(Fetch, BXOR,type,1,^,BINARY_OP,EXECUTE,type,void) \ 314 static void CPPJoin2(PackInit_Bitwise_,type)(PetscSFPack link) { \ 315 link->UnpackAndBAND = CPPJoin3_(UnpackAndBAND_,type,1); \ 316 link->UnpackAndBOR = CPPJoin3_(UnpackAndBOR_, type,1); \ 317 link->UnpackAndBXOR = CPPJoin3_(UnpackAndBXOR_,type,1); \ 318 link->FetchAndBAND = CPPJoin3_(FetchAndBAND_, type,1); \ 319 link->FetchAndBOR = CPPJoin3_(FetchAndBOR_, type,1); \ 320 link->FetchAndBXOR = CPPJoin3_(FetchAndBXOR_, type,1); \ 321 } 322 323 324 /* Pair types */ 325 #define DEF_PackPair(type1,type2) \ 326 typedef struct {type1 a; type2 b;} PairType(type1,type2); \ 327 DEF_PackFunc(PairType(type1,type2),1) \ 328 DEF_Action(Unpack,PairType(type1,type2),1,IGNORE,const PairType(type1,type2),const void) \ 329 DEF_Action(Fetch, PairType(type1,type2),1,EXECUTE,PairType(type1,type2),void) \ 330 DEF_ActionAndOp(Unpack,Add,PairType(type1,type2),1,+,PAIRTYPE_OP,IGNORE,const PairType(type1,type2),const void) \ 331 DEF_ActionAndOp(Fetch, Add,PairType(type1,type2),1,+,PAIRTYPE_OP,EXECUTE,PairType(type1,type2),void) \ 332 DEF_ActionAndXloc(Unpack,Max,type1,type2,>,IGNORE,const PairType(type1,type2),const void) \ 333 DEF_ActionAndXloc(Unpack,Min,type1,type2,<,IGNORE,const PairType(type1,type2),const void) \ 334 DEF_ActionAndXloc(Fetch, Max,type1,type2,>,EXECUTE,PairType(type1,type2),void) \ 335 DEF_ActionAndXloc(Fetch, Min,type1,type2,<,EXECUTE,PairType(type1,type2),void) \ 336 static void CPPJoin3_(PackInit_,type1,type2)(PetscSFPack link) { \ 337 link->Pack = CPPJoin3_(Pack_, PairType(type1,type2),1); \ 338 link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,PairType(type1,type2),1); \ 339 link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, PairType(type1,type2),1); \ 340 link->UnpackAndMaxloc = CPPJoin3_(UnpackAndMaxloc_,PairType(type1,type2),1); \ 341 link->UnpackAndMinloc = CPPJoin3_(UnpackAndMinloc_,PairType(type1,type2),1); \ 342 link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, PairType(type1,type2),1); \ 343 link->FetchAndAdd = CPPJoin3_(FetchAndAdd_, PairType(type1,type2),1); \ 344 link->FetchAndMaxloc = CPPJoin3_(FetchAndMaxloc_, PairType(type1,type2),1); \ 345 link->FetchAndMinloc = CPPJoin3_(FetchAndMinloc_, PairType(type1,type2),1); \ 346 link->unitbytes = sizeof(PairType(type1,type2)); \ 347 } 348 349 350 /* Currently only dumb blocks of data */ 351 #define DEF_Block(type,count) \ 352 typedef struct {type v[count];} BlockType(type,count); \ 353 DEF_PackNoInit(BlockType(type,count),1) \ 354 static void CPPJoin3_(PackInit_block_,type,count)(PetscSFPack link) { \ 355 link->Pack = CPPJoin3_(Pack_, BlockType(type,count),1); \ 356 link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,BlockType(type,count),1); \ 357 link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, BlockType(type,count),1); \ 358 link->unitbytes = sizeof(BlockType(type,count)); \ 359 } 360 361 /* The typedef is used to get a typename without space that CPPJoin can handle */ 362 typedef signed char SignedChar; 363 typedef unsigned char UnsignedChar; 364 365 DEF_PackCmp(SignedChar) 366 DEF_PackBit(SignedChar) 367 DEF_PackLog(SignedChar) 368 DEF_PackCmp(UnsignedChar) 369 DEF_PackBit(UnsignedChar) 370 DEF_PackLog(UnsignedChar) 371 DEF_PackCmp(int) 372 DEF_PackBit(int) 373 DEF_PackLog(int) 374 DEF_PackCmp(PetscInt) 375 DEF_PackBit(PetscInt) 376 DEF_PackLog(PetscInt) 377 DEF_Pack(PetscInt,2) 378 DEF_Pack(PetscInt,3) 379 DEF_Pack(PetscInt,4) 380 DEF_Pack(PetscInt,5) 381 DEF_Pack(PetscInt,7) 382 DEF_PackCmp(PetscReal) 383 DEF_PackLog(PetscReal) 384 DEF_Pack(PetscReal,2) 385 DEF_Pack(PetscReal,3) 386 DEF_Pack(PetscReal,4) 387 DEF_Pack(PetscReal,5) 388 DEF_Pack(PetscReal,7) 389 #if defined(PETSC_HAVE_COMPLEX) 390 DEF_Pack(PetscComplex,1) 391 DEF_Pack(PetscComplex,2) 392 DEF_Pack(PetscComplex,3) 393 DEF_Pack(PetscComplex,4) 394 DEF_Pack(PetscComplex,5) 395 DEF_Pack(PetscComplex,7) 396 #endif 397 DEF_PackPair(int,int) 398 DEF_PackPair(PetscInt,PetscInt) 399 DEF_Block(int,1) 400 DEF_Block(int,2) 401 DEF_Block(int,4) 402 DEF_Block(int,8) 403 DEF_Block(char,1) 404 DEF_Block(char,2) 405 DEF_Block(char,4) 406 407 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 408 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 409 { 410 int ierr; 411 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 412 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 413 return MPI_SUCCESS; 414 } 415 #endif 416 417 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink) 418 { 419 PetscErrorCode ierr; 420 PetscSFPack link,*p; 421 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 422 423 PetscFunctionBegin; 424 /* Look for types in cache */ 425 for (p=&bas->inuse; (link=*p); p=&link->next) { 426 PetscBool match; 427 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 428 if (match && (rkey == link->rkey) && (lkey == link->lkey)) { 429 switch (cmode) { 430 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 431 case PETSC_USE_POINTER: break; 432 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 433 } 434 *mylink = link; 435 PetscFunctionReturn(0); 436 } 437 } 438 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 439 PetscFunctionReturn(0); 440 } 441 442 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 443 { 444 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 445 446 PetscFunctionBegin; 447 (*link)->rkey = NULL; 448 (*link)->lkey = NULL; 449 (*link)->next = bas->avail; 450 bas->avail = *link; 451 *link = NULL; 452 PetscFunctionReturn(0); 453 } 454 455 /* Error out on unsupported overlapped communications */ 456 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey) 457 { 458 PetscErrorCode ierr; 459 PetscSFPack link,*p; 460 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 461 PetscBool match; 462 463 PetscFunctionBegin; 464 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 465 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 466 */ 467 if (rkey || lkey) { 468 for (p=&bas->inuse; (link=*p); p=&link->next) { 469 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 470 if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey); 471 } 472 } 473 PetscFunctionReturn(0); 474 } 475 476 PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit) 477 { 478 PetscErrorCode ierr; 479 PetscBool isInt,isPetscInt,isPetscReal,is2Int,is2PetscInt,isSignedChar,isUnsignedChar; 480 PetscInt nPetscIntContig,nPetscRealContig; 481 PetscMPIInt ni,na,nd,combiner; 482 #if defined(PETSC_HAVE_COMPLEX) 483 PetscBool isPetscComplex; 484 PetscInt nPetscComplexContig; 485 #endif 486 487 PetscFunctionBegin; 488 ierr = MPIPetsc_Type_compare(unit,MPI_SIGNED_CHAR,&isSignedChar);CHKERRQ(ierr); 489 ierr = MPIPetsc_Type_compare(unit,MPI_UNSIGNED_CHAR,&isUnsignedChar);CHKERRQ(ierr); 490 /* MPI_CHAR is treated below as a dumb block type that does not support reduction according to MPI standard */ 491 ierr = MPIPetsc_Type_compare(unit,MPI_INT,&isInt);CHKERRQ(ierr); 492 ierr = MPIPetsc_Type_compare(unit,MPIU_INT,&isPetscInt);CHKERRQ(ierr); 493 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT,&nPetscIntContig);CHKERRQ(ierr); 494 ierr = MPIPetsc_Type_compare(unit,MPIU_REAL,&isPetscReal);CHKERRQ(ierr); 495 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscRealContig);CHKERRQ(ierr); 496 #if defined(PETSC_HAVE_COMPLEX) 497 ierr = MPIPetsc_Type_compare(unit,MPIU_COMPLEX,&isPetscComplex);CHKERRQ(ierr); 498 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplexContig);CHKERRQ(ierr); 499 #endif 500 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 501 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 502 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 503 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; 504 link->bs = 1; 505 506 if (isSignedChar) {PackInit_SignedChar(link); PackInit_Logical_SignedChar(link); PackInit_Bitwise_SignedChar(link); link->basicunit = MPI_SIGNED_CHAR;} 507 else if (isUnsignedChar) {PackInit_UnsignedChar(link); PackInit_Logical_UnsignedChar(link); PackInit_Bitwise_UnsignedChar(link); link->basicunit = MPI_UNSIGNED_CHAR;} 508 else if (isInt) {PackInit_int(link); PackInit_Logical_int(link); PackInit_Bitwise_int(link); link->basicunit = MPI_INT;} 509 else if (isPetscInt) {PackInit_PetscInt(link); PackInit_Logical_PetscInt(link); PackInit_Bitwise_PetscInt(link); link->basicunit = MPIU_INT;} 510 else if (isPetscReal) {PackInit_PetscReal(link); PackInit_Logical_PetscReal(link); link->basicunit = MPIU_REAL;} 511 #if defined(PETSC_HAVE_COMPLEX) 512 else if (isPetscComplex) {PackInit_PetscComplex_1(link); link->basicunit = MPIU_COMPLEX;} 513 #endif 514 else if (is2Int) {PackInit_int_int(link); link->basicunit = MPI_2INT;} 515 else if (is2PetscInt) {PackInit_PetscInt_PetscInt(link); link->basicunit = MPIU_2INT;} 516 else if (nPetscIntContig) { 517 if (nPetscIntContig%7 == 0) PackInit_PetscInt_7(link); 518 else if (nPetscIntContig%5 == 0) PackInit_PetscInt_5(link); 519 else if (nPetscIntContig%4 == 0) PackInit_PetscInt_4(link); 520 else if (nPetscIntContig%3 == 0) PackInit_PetscInt_3(link); 521 else if (nPetscIntContig%2 == 0) PackInit_PetscInt_2(link); 522 else PackInit_PetscInt(link); 523 link->bs = nPetscIntContig; 524 link->unitbytes *= nPetscIntContig; 525 link->basicunit = MPIU_INT; 526 } else if (nPetscRealContig) { 527 if (nPetscRealContig%7 == 0) PackInit_PetscReal_7(link); 528 else if (nPetscRealContig%5 == 0) PackInit_PetscReal_5(link); 529 else if (nPetscRealContig%4 == 0) PackInit_PetscReal_4(link); 530 else if (nPetscRealContig%3 == 0) PackInit_PetscReal_3(link); 531 else if (nPetscRealContig%2 == 0) PackInit_PetscReal_2(link); 532 else PackInit_PetscReal(link); 533 link->bs = nPetscRealContig; 534 link->unitbytes *= nPetscRealContig; 535 link->basicunit = MPIU_REAL; 536 #if defined(PETSC_HAVE_COMPLEX) 537 } else if (nPetscComplexContig) { 538 if (nPetscComplexContig%7 == 0) PackInit_PetscComplex_7(link); 539 else if (nPetscComplexContig%5 == 0) PackInit_PetscComplex_5(link); 540 else if (nPetscComplexContig%4 == 0) PackInit_PetscComplex_4(link); 541 else if (nPetscComplexContig%3 == 0) PackInit_PetscComplex_3(link); 542 else if (nPetscComplexContig%2 == 0) PackInit_PetscComplex_2(link); 543 else PackInit_PetscComplex_1(link); 544 link->bs = nPetscComplexContig; 545 link->unitbytes *= nPetscComplexContig; 546 link->basicunit = MPIU_COMPLEX; 547 #endif 548 } else { 549 MPI_Aint lb,bytes; 550 ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr); 551 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 552 if (bytes % sizeof(int)) { /* If the type size is not multiple of int */ 553 if (bytes%4 == 0) {PackInit_block_char_4(link); link->bs = bytes/4;} /* Note the basic type is char[4] */ 554 else if (bytes%2 == 0) {PackInit_block_char_2(link); link->bs = bytes/2;} 555 else {PackInit_block_char_1(link); link->bs = bytes/1;} 556 link->unitbytes = bytes; 557 link->basicunit = MPI_CHAR; 558 } else { 559 PetscInt nInt = bytes / sizeof(int); 560 if (nInt%8 == 0) {PackInit_block_int_8(link); link->bs = nInt/8;} /* Note the basic type is int[8] */ 561 else if (nInt%4 == 0) {PackInit_block_int_4(link); link->bs = nInt/4;} 562 else if (nInt%2 == 0) {PackInit_block_int_2(link); link->bs = nInt/2;} 563 else {PackInit_block_int_1(link); link->bs = nInt/1;} 564 link->unitbytes = bytes; 565 link->basicunit = MPI_INT; 566 } 567 } 568 if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */ 569 else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 570 PetscFunctionReturn(0); 571 } 572 573 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*)) 574 { 575 PetscFunctionBegin; 576 *UnpackAndOp = NULL; 577 if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert; 578 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd; 579 else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult; 580 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax; 581 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin; 582 else if (op == MPI_LAND) *UnpackAndOp = link->UnpackAndLAND; 583 else if (op == MPI_BAND) *UnpackAndOp = link->UnpackAndBAND; 584 else if (op == MPI_LOR) *UnpackAndOp = link->UnpackAndLOR; 585 else if (op == MPI_BOR) *UnpackAndOp = link->UnpackAndBOR; 586 else if (op == MPI_LXOR) *UnpackAndOp = link->UnpackAndLXOR; 587 else if (op == MPI_BXOR) *UnpackAndOp = link->UnpackAndBXOR; 588 else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc; 589 else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc; 590 else *UnpackAndOp = NULL; 591 PetscFunctionReturn(0); 592 } 593 594 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*)) 595 { 596 PetscFunctionBegin; 597 *FetchAndOp = NULL; 598 if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert; 599 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd; 600 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax; 601 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin; 602 else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc; 603 else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc; 604 else if (op == MPI_PROD) *FetchAndOp = link->FetchAndMult; 605 else if (op == MPI_LAND) *FetchAndOp = link->FetchAndLAND; 606 else if (op == MPI_BAND) *FetchAndOp = link->FetchAndBAND; 607 else if (op == MPI_LOR) *FetchAndOp = link->FetchAndLOR; 608 else if (op == MPI_BOR) *FetchAndOp = link->FetchAndBOR; 609 else if (op == MPI_LXOR) *FetchAndOp = link->FetchAndLXOR; 610 else if (op == MPI_BXOR) *FetchAndOp = link->FetchAndBXOR; 611 else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op"); 612 PetscFunctionReturn(0); 613 } 614 615 /* 616 Setup pack/unpack optimization plans based on indice patterns available 617 618 Input Parameters: 619 + n - number of target processors 620 . offset - [n+1] for the i-th processor, its associated indices are idx[offset[i], offset[i+1]) 621 - idx - [] array storing indices. Its length is offset[n+1] 622 623 Output Parameters: 624 + opt - the optimization 625 */ 626 PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 627 { 628 PetscErrorCode ierr; 629 PetscInt i,j,k,n_copies,tot_copies=0,step; 630 PetscBool strided,has_strided=PETSC_FALSE,has_optimized=PETSC_FALSE; 631 PetscSFPackOpt opt; 632 633 PetscFunctionBegin; 634 ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 635 ierr = PetscCalloc2(n,&opt->optimized,n+1,&opt->copy_offset);CHKERRQ(ierr); 636 637 /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 638 for (i=0; i<n; i++) { /* for each target processor */ 639 /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 640 n_copies = 1; 641 for (j=offset[i]; j<offset[i+1]-1; j++) { 642 if (idx[j]+1 != idx[j+1]) n_copies++; 643 } 644 /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 645 then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 646 */ 647 if ((offset[i+1]-offset[i])/n_copies >= 32) { 648 opt->optimized[i] = PETSC_TRUE; 649 has_optimized = PETSC_TRUE; 650 tot_copies += n_copies; 651 } 652 } 653 654 /* Setup memcpy plan for each contiguous piece */ 655 k = 0; /* k-th copy */ 656 ierr = PetscMalloc2(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length);CHKERRQ(ierr); 657 for (i=0; i<n; i++) { /* for each target processor procs[i] */ 658 if (opt->optimized[i]) { 659 n_copies = 1; 660 opt->copy_start[k] = idx[offset[i]]; 661 for (j=offset[i]; j<offset[i+1]-1; j++) { 662 if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 663 n_copies++; 664 opt->copy_start[k+1] = idx[j+1]; 665 opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1; 666 k++; 667 } 668 } 669 /* Set copy length of the last copy for this target */ 670 opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1; 671 k++; 672 } 673 /* Set offset for next target. When optimized[i]=false, copy_offsets[i]=copy_offsets[i+1] */ 674 opt->copy_offset[i+1] = k; 675 } 676 677 /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 678 ierr = PetscMalloc3(n,&opt->stride_first,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 679 for (i=0; i<n; i++) { /* for each remote */ 680 if (!opt->optimized[i] && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 681 strided = PETSC_TRUE; 682 step = idx[offset[i]+1] - idx[offset[i]]; 683 for (j=offset[i]; j<offset[i+1]-1; j++) { 684 if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 685 } 686 if (strided) { 687 opt->optimized[i] = PETSC_TRUE; 688 opt->stride_first[i] = idx[offset[i]]; 689 opt->stride_step[i] = step; 690 opt->stride_n[i] = offset[i+1] - offset[i]; 691 has_strided = PETSC_TRUE; 692 has_optimized = PETSC_TRUE; 693 } 694 } 695 } 696 /* If no target has been stride-optimized or optimized, free related arrays to save memory */ 697 if (!has_strided) {ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);} 698 if (!has_optimized) { 699 ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr); 700 ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr); 701 ierr = PetscFree(opt);CHKERRQ(ierr); 702 *out = NULL; 703 } else *out = opt; 704 PetscFunctionReturn(0); 705 } 706 707 PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out) 708 { 709 PetscErrorCode ierr; 710 PetscSFPackOpt opt = *out; 711 712 PetscFunctionBegin; 713 if (opt) { 714 ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr); 715 ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr); 716 ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 717 ierr = PetscFree(opt);CHKERRQ(ierr); 718 *out = NULL; 719 } 720 PetscFunctionReturn(0); 721 } 722