1 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> 7 #endif 8 /* 9 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 10 * therefore we pack data types manually. This file defines packing routines for the standard data types. 11 */ 12 13 #define CPPJoin2(a,b) a ##_## b 14 #define CPPJoin3(a,b,c) a ##_## b ##_## c 15 #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d 16 #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d) 17 18 #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 19 #define IGNORE(statement) do {} while(0) 20 21 #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 22 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 23 #define LXOR_OP(r,s,op,t) do {(r) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 24 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 25 26 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */ 27 28 /* DEF_PackFunc - macro defining a Pack routine 29 30 Arguments of the macro: 31 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 32 .BS Block size for vectorization. It is a factor of bs. 33 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 34 35 Arguments of the Pack routine: 36 +count Number of indices in idx[] 37 .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count) 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .opt Pack optimization plans. NULL means no plan at all. 40 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count) 41 -packed Address of the packed data for each rank 42 */ 43 #define DEF_PackFunc(Type,BS,EQ) \ 44 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45 { \ 46 PetscErrorCode ierr; \ 47 const Type *u = (const Type*)unpacked,*u2; \ 48 Type *p = (Type*)packed,*p2; \ 49 PetscInt i,j,k,l,r,step,bs=link->bs; \ 50 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 52 PetscFunctionBegin; \ 53 if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54 else if (!opt) { /* No optimizations available */ \ 55 for (i=0; i<count; i++) \ 56 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59 } else { \ 60 for (r=0; r<opt->n; r++) { \ 61 p2 = p + opt->offset[r]*MBS; \ 62 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63 idx2 = idx + opt->offset[r]; \ 64 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65 for (j=0; j<M; j++) \ 66 for (k=0; k<BS; k++) \ 67 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 69 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70 u2 = u + idx[opt->copy_start[i]]*MBS; \ 71 l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72 ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73 p2 += l; \ 74 } \ 75 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76 u2 = u + idx[opt->offset[r]]*MBS; \ 77 step = opt->stride_step[r]; \ 78 for (i=0; i<opt->stride_n[r]; i++) \ 79 for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 80 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 81 } \ 82 } \ 83 PetscFunctionReturn(0); \ 84 } 85 86 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 87 88 Arguments: 89 +action Unpack or Fetch 90 .Type Type of the data 91 .BS Block size for vectorization 92 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 93 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 94 .CType Type with or without the const qualifier, i.e., const Type or Type 95 .Cvoid void with or without the const qualifier, i.e., const void or void 96 97 Notes: 98 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 99 The two arguments CType and Cvoid are used (instead of one constness argument), because we want to 100 get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 101 sometimes we input 'const', sometimes we have to input empty. 102 103 If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare. 104 */ 105 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \ 106 static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 107 { \ 108 PetscErrorCode ierr; \ 109 Type *u = (Type*)unpacked,*u2; \ 110 CType *p = (CType*)packed,*p2; \ 111 PetscInt i,j,k,l,r,step,bs=link->bs; \ 112 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 113 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 114 PetscFunctionBegin; \ 115 if (!idx) { \ 116 FILTER(Type *v); \ 117 FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \ 118 FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \ 119 ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr); \ 120 FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \ 121 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 122 } else if (!opt) { /* No optimizations available */ \ 123 for (i=0; i<count; i++) \ 124 for (j=0; j<M; j++) \ 125 for (k=0; k<BS; k++) { \ 126 FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \ 127 u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 128 FILTER(p[i*MBS+j*BS+k] = t); \ 129 } \ 130 } else { \ 131 for (r=0; r<opt->n; r++) { \ 132 p2 = p + opt->offset[r]*MBS; \ 133 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 134 idx2 = idx + opt->offset[r]; \ 135 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 136 for (j=0; j<M; j++) \ 137 for (k=0; k<BS; k++) { \ 138 FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \ 139 u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 140 FILTER(p2[i*MBS+j*BS+k] = t); \ 141 } \ 142 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 143 FILTER(Type *v); \ 144 FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \ 145 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 146 u2 = u + idx[opt->copy_start[i]]*MBS; \ 147 l = opt->copy_length[i]*MBS; \ 148 FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 149 ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 150 FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \ 151 p2 += l; \ 152 } \ 153 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 154 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 155 u2 = u + idx[opt->offset[r]]*MBS; \ 156 step = opt->stride_step[r]; \ 157 for (i=0; i<opt->stride_n[r]; i++) \ 158 for (j=0; j<M; j++) \ 159 for (k=0; k<BS; k++) { \ 160 FILTER(Type t = u2[i*step*MBS+j*BS+k]); \ 161 u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 162 FILTER(p2[i*MBS+j*BS+k] = t); \ 163 } \ 164 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 165 } \ 166 } \ 167 PetscFunctionReturn(0); \ 168 } 169 170 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 171 172 Arguments: 173 +action Unpack or Fetch 174 .opname Name of the Op, such as Add, Mult, LAND, etc. 175 .Type Type of the data 176 .BS Block size for vectorization 177 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 178 .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 179 .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 180 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 181 .CType Type with or without the const qualifier, i.e., const Type or Type 182 -Cvoid void with or without the const qualifier, i.e., const void or void 183 */ 184 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \ 185 static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 186 { \ 187 Type *u = (Type*)unpacked,*u2,t; \ 188 CType *p = (CType*)packed,*p2; \ 189 PetscInt i,j,k,l,r,step,bs=link->bs; \ 190 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 191 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 192 PetscFunctionBegin; \ 193 if (!idx) { \ 194 for (i=0; i<count; i++) \ 195 for (j=0; j<M; j++) \ 196 for (k=0; k<BS; k++) { \ 197 t = u[i*MBS+j*BS+k]; \ 198 APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 199 FILTER(p[i*MBS+j*BS+k] = t); \ 200 } \ 201 } else if (!opt) { /* No optimizations available */ \ 202 for (i=0; i<count; i++) \ 203 for (j=0; j<M; j++) \ 204 for (k=0; k<BS; k++) { \ 205 t = u[idx[i]*MBS+j*BS+k]; \ 206 APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 207 FILTER(p[i*MBS+j*BS+k] = t); \ 208 } \ 209 } else { \ 210 for (r=0; r<opt->n; r++) { \ 211 p2 = p + opt->offset[r]*MBS; \ 212 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 213 idx2 = idx + opt->offset[r]; \ 214 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 215 for (j=0; j<M; j++) \ 216 for (k=0; k<BS; k++) { \ 217 t = u[idx2[i]*MBS+j*BS+k]; \ 218 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 219 FILTER(p2[i*MBS+j*BS+k] = t); \ 220 } \ 221 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 222 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 223 u2 = u + idx[opt->copy_start[i]]*MBS; \ 224 l = opt->copy_length[i]*MBS; \ 225 for (j=0; j<l; j++) { \ 226 t = u2[j]; \ 227 APPLY (u2[j],t,op,p2[j]); \ 228 FILTER(p2[j] = t); \ 229 } \ 230 p2 += l; \ 231 } \ 232 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 233 u2 = u + idx[opt->offset[r]]*MBS; \ 234 step = opt->stride_step[r]; \ 235 for (i=0; i<opt->stride_n[r]; i++) \ 236 for (j=0; j<M; j++) \ 237 for (k=0; k<BS; k++) { \ 238 t = u2[i*step*MBS+j*BS+k]; \ 239 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 240 FILTER(p2[i*MBS+j*BS+k] = t); \ 241 } \ 242 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 243 } \ 244 } \ 245 PetscFunctionReturn(0); \ 246 } 247 248 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 249 250 Arguments: 251 +Action Unpack or Fetch 252 .locname Max or Min 253 .type1 Type of the first data in a pair type 254 .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 255 .op > or < 256 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 257 .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2) 258 -Cvoid void with or without the const qualifier, i.e., const void or void 259 */ 260 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \ 261 static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \ 262 PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \ 263 CType *p = (CType*)packed; \ 264 PetscInt i,j; \ 265 for (i=0; i<count; i++) { \ 266 FILTER(PairType(Type1,Type2) v); \ 267 j = idx? idx[i] : i; \ 268 FILTER(v = u[j]); \ 269 if (p[i].a op u[j].a) { \ 270 u[j] = p[i]; \ 271 } else if (p[i].a == u[j].a) { \ 272 u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 273 } \ 274 FILTER(p[i] = v); \ 275 } \ 276 PetscFunctionReturn(0); \ 277 } 278 279 /* Pack, Unpack/Fetch ops */ 280 #define DEF_Pack(Type,BS,EQ) \ 281 DEF_PackFunc(Type,BS,EQ) \ 282 DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \ 283 DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \ 284 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \ 285 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 286 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 287 link->h_FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \ 288 } 289 290 /* Add, Mult ops */ 291 #define DEF_Add(Type,BS,EQ) \ 292 DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \ 293 DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \ 294 DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \ 295 DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \ 296 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \ 297 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 298 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 299 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 300 link->h_FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \ 301 } 302 303 /* Max, Min ops */ 304 #define DEF_Cmp(Type,BS,EQ) \ 305 DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \ 306 DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \ 307 DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \ 308 DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \ 309 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \ 310 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 311 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 312 link->h_FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \ 313 link->h_FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \ 314 } 315 316 /* Logical ops. 317 The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 318 the compilation warning "empty macro arguments are undefined in ISO C90" 319 */ 320 #define DEF_Log(Type,BS,EQ) \ 321 DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \ 322 DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \ 323 DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \ 324 DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \ 325 DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \ 326 DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \ 327 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \ 328 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \ 329 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 330 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \ 331 link->h_FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \ 332 link->h_FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \ 333 link->h_FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \ 334 } 335 336 /* Bitwise ops */ 337 #define DEF_Bit(Type,BS,EQ) \ 338 DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \ 339 DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \ 340 DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \ 341 DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \ 342 DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \ 343 DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \ 344 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \ 345 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \ 346 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 347 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \ 348 link->h_FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \ 349 link->h_FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \ 350 link->h_FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \ 351 } 352 353 /* Maxloc, Minloc */ 354 #define DEF_Xloc(Type1,Type2) \ 355 DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \ 356 DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \ 357 DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \ 358 DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \ 359 static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \ 360 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \ 361 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \ 362 link->h_FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \ 363 link->h_FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \ 364 } 365 366 #define DEF_IntegerType(Type,BS,EQ) \ 367 DEF_Pack(Type,BS,EQ) \ 368 DEF_Add(Type,BS,EQ) \ 369 DEF_Cmp(Type,BS,EQ) \ 370 DEF_Log(Type,BS,EQ) \ 371 DEF_Bit(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 375 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 376 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 377 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 378 } 379 380 #define DEF_RealType(Type,BS,EQ) \ 381 DEF_Pack(Type,BS,EQ) \ 382 DEF_Add(Type,BS,EQ) \ 383 DEF_Cmp(Type,BS,EQ) \ 384 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \ 385 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 386 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 387 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 388 } 389 390 #if defined(PETSC_HAVE_COMPLEX) 391 #define DEF_ComplexType(Type,BS,EQ) \ 392 DEF_Pack(Type,BS,EQ) \ 393 DEF_Add(Type,BS,EQ) \ 394 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \ 395 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 396 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 397 } 398 #endif 399 400 #define DEF_DumbType(Type,BS,EQ) \ 401 DEF_Pack(Type,BS,EQ) \ 402 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \ 403 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 404 } 405 406 /* Maxloc, Minloc */ 407 #define DEF_PairType(Type1,Type2) \ 408 typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \ 409 DEF_Pack(PairType(Type1,Type2),1,1) \ 410 DEF_Xloc(Type1,Type2) \ 411 static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \ 412 CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \ 413 CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \ 414 } 415 416 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 417 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 418 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 419 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 420 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 421 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 422 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 423 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 424 425 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 426 DEF_IntegerType(int,1,1) 427 DEF_IntegerType(int,2,1) 428 DEF_IntegerType(int,4,1) 429 DEF_IntegerType(int,8,1) 430 DEF_IntegerType(int,1,0) 431 DEF_IntegerType(int,2,0) 432 DEF_IntegerType(int,4,0) 433 DEF_IntegerType(int,8,0) 434 #endif 435 436 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 437 typedef signed char SignedChar; 438 DEF_IntegerType(SignedChar,1,1) 439 DEF_IntegerType(SignedChar,2,1) 440 DEF_IntegerType(SignedChar,4,1) 441 DEF_IntegerType(SignedChar,8,1) 442 DEF_IntegerType(SignedChar,1,0) 443 DEF_IntegerType(SignedChar,2,0) 444 DEF_IntegerType(SignedChar,4,0) 445 DEF_IntegerType(SignedChar,8,0) 446 447 typedef unsigned char UnsignedChar; 448 DEF_IntegerType(UnsignedChar,1,1) 449 DEF_IntegerType(UnsignedChar,2,1) 450 DEF_IntegerType(UnsignedChar,4,1) 451 DEF_IntegerType(UnsignedChar,8,1) 452 DEF_IntegerType(UnsignedChar,1,0) 453 DEF_IntegerType(UnsignedChar,2,0) 454 DEF_IntegerType(UnsignedChar,4,0) 455 DEF_IntegerType(UnsignedChar,8,0) 456 457 DEF_RealType(PetscReal,1,1) 458 DEF_RealType(PetscReal,2,1) 459 DEF_RealType(PetscReal,4,1) 460 DEF_RealType(PetscReal,8,1) 461 DEF_RealType(PetscReal,1,0) 462 DEF_RealType(PetscReal,2,0) 463 DEF_RealType(PetscReal,4,0) 464 DEF_RealType(PetscReal,8,0) 465 466 #if defined(PETSC_HAVE_COMPLEX) 467 DEF_ComplexType(PetscComplex,1,1) 468 DEF_ComplexType(PetscComplex,2,1) 469 DEF_ComplexType(PetscComplex,4,1) 470 DEF_ComplexType(PetscComplex,8,1) 471 DEF_ComplexType(PetscComplex,1,0) 472 DEF_ComplexType(PetscComplex,2,0) 473 DEF_ComplexType(PetscComplex,4,0) 474 DEF_ComplexType(PetscComplex,8,0) 475 #endif 476 477 DEF_PairType(int,int) 478 DEF_PairType(PetscInt,PetscInt) 479 480 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 481 DEF_DumbType(char,1,1) 482 DEF_DumbType(char,2,1) 483 DEF_DumbType(char,4,1) 484 DEF_DumbType(char,1,0) 485 DEF_DumbType(char,2,0) 486 DEF_DumbType(char,4,0) 487 488 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 489 DEF_DumbType(DumbInt,1,1) 490 DEF_DumbType(DumbInt,2,1) 491 DEF_DumbType(DumbInt,4,1) 492 DEF_DumbType(DumbInt,8,1) 493 DEF_DumbType(DumbInt,1,0) 494 DEF_DumbType(DumbInt,2,0) 495 DEF_DumbType(DumbInt,4,0) 496 DEF_DumbType(DumbInt,8,0) 497 498 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 499 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 500 { 501 int ierr; 502 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 503 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 504 return MPI_SUCCESS; 505 } 506 #endif 507 508 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink) 509 { 510 PetscErrorCode ierr; 511 PetscSFPack link,*p; 512 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 513 514 PetscFunctionBegin; 515 /* Look for types in cache */ 516 for (p=&bas->inuse; (link=*p); p=&link->next) { 517 PetscBool match; 518 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 519 if (match && (rkey == link->rkey) && (lkey == link->lkey)) { 520 switch (cmode) { 521 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 522 case PETSC_USE_POINTER: break; 523 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 524 } 525 *mylink = link; 526 PetscFunctionReturn(0); 527 } 528 } 529 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 530 PetscFunctionReturn(0); 531 } 532 533 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 534 { 535 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 536 537 PetscFunctionBegin; 538 (*link)->rkey = NULL; 539 (*link)->lkey = NULL; 540 (*link)->next = bas->avail; 541 bas->avail = *link; 542 *link = NULL; 543 PetscFunctionReturn(0); 544 } 545 546 /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */ 547 PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack *avail) 548 { 549 PetscErrorCode ierr; 550 PetscSFPack link=*avail,next; 551 PetscInt i; 552 553 PetscFunctionBegin; 554 for (; link; link=next) { 555 next = link->next; 556 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 557 for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */ 558 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 559 } 560 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 561 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 562 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 563 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 564 565 #if defined(PETSC_HAVE_CUDA) 566 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 567 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 568 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 569 if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 570 #endif 571 ierr = PetscFree(link);CHKERRQ(ierr); 572 } 573 *avail = NULL; 574 PetscFunctionReturn(0); 575 } 576 577 /* Error out on unsupported overlapped communications */ 578 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey) 579 { 580 PetscErrorCode ierr; 581 PetscSFPack link,*p; 582 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 583 PetscBool match; 584 585 PetscFunctionBegin; 586 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 587 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 588 */ 589 if (rkey || lkey) { 590 for (p=&bas->inuse; (link=*p); p=&link->next) { 591 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 592 if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey); 593 } 594 } 595 PetscFunctionReturn(0); 596 } 597 598 PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit) 599 { 600 PetscErrorCode ierr; 601 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 602 PetscBool is2Int,is2PetscInt; 603 PetscMPIInt ni,na,nd,combiner; 604 #if defined(PETSC_HAVE_COMPLEX) 605 PetscInt nPetscComplex=0; 606 #endif 607 608 PetscFunctionBegin; 609 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 610 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 611 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 612 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 613 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 614 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 615 #if defined(PETSC_HAVE_COMPLEX) 616 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 617 #endif 618 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 619 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 620 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 621 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 622 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 623 link->bs = 1; /* default */ 624 625 if (is2Int) { 626 PackInit_PairType_int_int(link); 627 link->bs = 1; 628 link->unitbytes = 2*sizeof(int); 629 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 630 link->basicunit = MPI_2INT; 631 link->unit = MPI_2INT; 632 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 633 PackInit_PairType_PetscInt_PetscInt(link); 634 link->bs = 1; 635 link->unitbytes = 2*sizeof(PetscInt); 636 link->basicunit = MPIU_2INT; 637 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 638 link->unit = MPIU_2INT; 639 } else if (nPetscReal) { 640 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 641 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 642 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 643 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 644 link->bs = nPetscReal; 645 link->unitbytes = nPetscReal*sizeof(PetscReal); 646 link->basicunit = MPIU_REAL; 647 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 648 } else if (nPetscInt) { 649 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 650 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 651 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 652 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 653 link->bs = nPetscInt; 654 link->unitbytes = nPetscInt*sizeof(PetscInt); 655 link->basicunit = MPIU_INT; 656 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 657 #if defined(PETSC_USE_64BIT_INDICES) 658 } else if (nInt) { 659 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 660 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 661 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 662 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 663 link->bs = nInt; 664 link->unitbytes = nInt*sizeof(int); 665 link->basicunit = MPI_INT; 666 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 667 #endif 668 } else if (nSignedChar) { 669 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 670 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 671 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 672 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 673 link->bs = nSignedChar; 674 link->unitbytes = nSignedChar*sizeof(SignedChar); 675 link->basicunit = MPI_SIGNED_CHAR; 676 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 677 } else if (nUnsignedChar) { 678 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 679 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 680 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 681 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 682 link->bs = nUnsignedChar; 683 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 684 link->basicunit = MPI_UNSIGNED_CHAR; 685 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 686 #if defined(PETSC_HAVE_COMPLEX) 687 } else if (nPetscComplex) { 688 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 689 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 690 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 691 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 692 link->bs = nPetscComplex; 693 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 694 link->basicunit = MPIU_COMPLEX; 695 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 696 #endif 697 } else { 698 MPI_Aint lb,nbyte; 699 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 700 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 701 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 702 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 703 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 704 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 705 link->bs = nbyte; 706 link->unitbytes = nbyte; 707 link->basicunit = MPI_BYTE; 708 } else { 709 nInt = nbyte / sizeof(int); 710 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 711 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 712 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 713 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 714 link->bs = nInt; 715 link->unitbytes = nbyte; 716 link->basicunit = MPI_INT; 717 } 718 if (link->isbuiltin) link->unit = unit; 719 } 720 721 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 722 PetscFunctionReturn(0); 723 } 724 725 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*)) 726 { 727 PetscFunctionBegin; 728 *UnpackAndOp = NULL; 729 if (mtype == PETSC_MEMTYPE_HOST) { 730 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 731 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 732 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 733 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 734 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 735 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 736 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 737 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 738 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 739 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 740 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 741 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 742 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 743 } 744 #if defined(PETSC_HAVE_CUDA) 745 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 746 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 747 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 748 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 749 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 750 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 751 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 752 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 753 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 754 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 755 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 756 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 757 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 758 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 759 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 760 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 761 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 762 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 763 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 764 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 765 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 766 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 767 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 768 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 769 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 770 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 771 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 772 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 773 } 774 #endif 775 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 776 777 PetscFunctionReturn(0); 778 } 779 780 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*)) 781 { 782 PetscFunctionBegin; 783 *FetchAndOp = NULL; 784 if (mtype == PETSC_MEMTYPE_HOST) { 785 if (op == MPIU_REPLACE) *FetchAndOp = link->h_FetchAndInsert; 786 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd; 787 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax; 788 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin; 789 else if (op == MPI_MAXLOC) *FetchAndOp = link->h_FetchAndMaxloc; 790 else if (op == MPI_MINLOC) *FetchAndOp = link->h_FetchAndMinloc; 791 else if (op == MPI_PROD) *FetchAndOp = link->h_FetchAndMult; 792 else if (op == MPI_LAND) *FetchAndOp = link->h_FetchAndLAND; 793 else if (op == MPI_BAND) *FetchAndOp = link->h_FetchAndBAND; 794 else if (op == MPI_LOR) *FetchAndOp = link->h_FetchAndLOR; 795 else if (op == MPI_BOR) *FetchAndOp = link->h_FetchAndBOR; 796 else if (op == MPI_LXOR) *FetchAndOp = link->h_FetchAndLXOR; 797 else if (op == MPI_BXOR) *FetchAndOp = link->h_FetchAndBXOR; 798 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 799 } 800 #if defined(PETSC_HAVE_CUDA) 801 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 802 if (op == MPIU_REPLACE) *FetchAndOp = link->d_FetchAndInsert; 803 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd; 804 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax; 805 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin; 806 else if (op == MPI_MAXLOC) *FetchAndOp = link->d_FetchAndMaxloc; 807 else if (op == MPI_MINLOC) *FetchAndOp = link->d_FetchAndMinloc; 808 else if (op == MPI_PROD) *FetchAndOp = link->d_FetchAndMult; 809 else if (op == MPI_LAND) *FetchAndOp = link->d_FetchAndLAND; 810 else if (op == MPI_BAND) *FetchAndOp = link->d_FetchAndBAND; 811 else if (op == MPI_LOR) *FetchAndOp = link->d_FetchAndLOR; 812 else if (op == MPI_BOR) *FetchAndOp = link->d_FetchAndBOR; 813 else if (op == MPI_LXOR) *FetchAndOp = link->d_FetchAndLXOR; 814 else if (op == MPI_BXOR) *FetchAndOp = link->d_FetchAndBXOR; 815 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 816 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 817 if (op == MPIU_REPLACE) *FetchAndOp = link->da_FetchAndInsert; 818 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd; 819 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax; 820 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin; 821 else if (op == MPI_MAXLOC) *FetchAndOp = link->da_FetchAndMaxloc; 822 else if (op == MPI_MINLOC) *FetchAndOp = link->da_FetchAndMinloc; 823 else if (op == MPI_PROD) *FetchAndOp = link->da_FetchAndMult; 824 else if (op == MPI_LAND) *FetchAndOp = link->da_FetchAndLAND; 825 else if (op == MPI_BAND) *FetchAndOp = link->da_FetchAndBAND; 826 else if (op == MPI_LOR) *FetchAndOp = link->da_FetchAndLOR; 827 else if (op == MPI_BOR) *FetchAndOp = link->da_FetchAndBOR; 828 else if (op == MPI_LXOR) *FetchAndOp = link->da_FetchAndLXOR; 829 else if (op == MPI_BXOR) *FetchAndOp = link->da_FetchAndBXOR; 830 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 831 } 832 #endif 833 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 834 PetscFunctionReturn(0); 835 } 836 837 /* 838 Create pack/unpack optimization plans based on indice patterns available 839 840 Input Parameters: 841 + n - Number of target ranks 842 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 843 - idx - [*] Array storing indices 844 845 Output Parameters: 846 + opt - Optimization plans. Maybe NULL if no optimization can be built. 847 */ 848 PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 849 { 850 PetscErrorCode ierr; 851 PetscInt i,j,k,n_copies,tot_copies=0,step; 852 PetscBool strided,optimized=PETSC_FALSE; 853 PetscSFPackOpt opt; 854 855 PetscFunctionBegin; 856 if (!n) { 857 *out = NULL; 858 PetscFunctionReturn(0); 859 } 860 861 ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 862 ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 863 ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 864 if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/ 865 866 opt->n = n; 867 868 /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 869 for (i=0; i<n; i++) { /* for each target processor */ 870 /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 871 n_copies = 1; 872 for (j=offset[i]; j<offset[i+1]-1; j++) { 873 if (idx[j]+1 != idx[j+1]) n_copies++; 874 } 875 /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 876 then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 877 */ 878 if ((offset[i+1]-offset[i])/n_copies >= 32) { 879 opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 880 optimized = PETSC_TRUE; 881 tot_copies += n_copies; 882 } 883 } 884 885 /* Setup memcpy plan for each contiguous piece */ 886 k = 0; /* k-th copy */ 887 ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 888 for (i=0; i<n; i++) { /* for each target processor */ 889 if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 890 n_copies = 1; 891 opt->copy_start[k] = offset[i] - offset[0]; 892 for (j=offset[i]; j<offset[i+1]-1; j++) { 893 if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 894 n_copies++; 895 opt->copy_start[k+1] = j-offset[0]+1; 896 opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 897 k++; 898 } 899 } 900 /* Set copy length of the last copy for this target */ 901 opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 902 k++; 903 } 904 /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 905 opt->copy_offset[i+1] = k; 906 } 907 908 /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 909 for (i=0; i<n; i++) { /* for each remote */ 910 if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 911 strided = PETSC_TRUE; 912 step = idx[offset[i]+1] - idx[offset[i]]; 913 for (j=offset[i]; j<offset[i+1]-1; j++) { 914 if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 915 } 916 if (strided) { 917 opt->type[i] = PETSCSF_PACKOPT_STRIDE; 918 opt->stride_step[i] = step; 919 opt->stride_n[i] = offset[i+1] - offset[i]; 920 optimized = PETSC_TRUE; 921 } 922 } 923 } 924 /* If no rank gets optimized, free arrays to save memory */ 925 if (!optimized) { 926 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 927 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 928 ierr = PetscFree(opt);CHKERRQ(ierr); 929 *out = NULL; 930 } else *out = opt; 931 PetscFunctionReturn(0); 932 } 933 934 PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out) 935 { 936 PetscErrorCode ierr; 937 PetscSFPackOpt opt = *out; 938 939 PetscFunctionBegin; 940 if (opt) { 941 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 942 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 943 ierr = PetscFree(opt);CHKERRQ(ierr); 944 *out = NULL; 945 } 946 PetscFunctionReturn(0); 947 } 948