1 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> 7 #endif 8 /* 9 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 10 * therefore we pack data types manually. This file defines packing routines for the standard data types. 11 */ 12 13 #define CPPJoin2(a,b) a ##_## b 14 #define CPPJoin3(a,b,c) a ##_## b ##_## c 15 #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d 16 #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d) 17 18 #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 19 #define IGNORE(statement) do {} while(0) 20 21 #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 22 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 23 #define LXOR_OP(r,s,op,t) do {(r) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 24 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 25 26 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */ 27 28 /* DEF_PackFunc - macro defining a Pack routine 29 30 Arguments of the macro: 31 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 32 .BS Block size for vectorization. It is a factor of bs. 33 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 34 35 Arguments of the Pack routine: 36 +count Number of indices in idx[] 37 .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count) 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .opt Pack optimization plans. NULL means no plan at all. 40 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count) 41 -packed Address of the packed data for each rank 42 */ 43 #define DEF_PackFunc(Type,BS,EQ) \ 44 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45 { \ 46 PetscErrorCode ierr; \ 47 const Type *u = (const Type*)unpacked,*u2; \ 48 Type *p = (Type*)packed,*p2; \ 49 PetscInt i,j,k,l,r,step,bs=link->bs; \ 50 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 52 PetscFunctionBegin; \ 53 if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54 else if (!opt) { /* No optimizations available */ \ 55 for (i=0; i<count; i++) \ 56 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59 } else { \ 60 for (r=0; r<opt->n; r++) { \ 61 p2 = p + opt->offset[r]*MBS; \ 62 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63 idx2 = idx + opt->offset[r]; \ 64 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65 for (j=0; j<M; j++) \ 66 for (k=0; k<BS; k++) \ 67 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 69 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70 u2 = u + idx[opt->copy_start[i]]*MBS; \ 71 l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72 ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73 p2 += l; \ 74 } \ 75 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76 u2 = u + idx[opt->offset[r]]*MBS; \ 77 step = opt->stride_step[r]; \ 78 for (i=0; i<opt->stride_n[r]; i++) \ 79 for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 80 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 81 } \ 82 } \ 83 PetscFunctionReturn(0); \ 84 } 85 86 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 87 88 Arguments: 89 +action Unpack or Fetch 90 .Type Type of the data 91 .BS Block size for vectorization 92 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 93 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 94 .CType Type with or without the const qualifier, i.e., const Type or Type 95 .Cvoid void with or without the const qualifier, i.e., const void or void 96 97 Notes: 98 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 99 The two arguments CType and Cvoid are used (instead of one constness argument), because we want to 100 get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 101 sometimes we input 'const', sometimes we have to input empty. 102 103 If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare. 104 */ 105 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \ 106 static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 107 { \ 108 PetscErrorCode ierr; \ 109 Type *u = (Type*)unpacked,*u2; \ 110 CType *p = (CType*)packed,*p2; \ 111 PetscInt i,j,k,l,r,step,bs=link->bs; \ 112 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 113 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 114 PetscFunctionBegin; \ 115 if (!idx) { \ 116 FILTER(Type *v); \ 117 FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \ 118 FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \ 119 ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr); \ 120 FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \ 121 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 122 } else if (!opt) { /* No optimizations available */ \ 123 for (i=0; i<count; i++) \ 124 for (j=0; j<M; j++) \ 125 for (k=0; k<BS; k++) { \ 126 FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \ 127 u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 128 FILTER(p[i*MBS+j*BS+k] = t); \ 129 } \ 130 } else { \ 131 for (r=0; r<opt->n; r++) { \ 132 p2 = p + opt->offset[r]*MBS; \ 133 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 134 idx2 = idx + opt->offset[r]; \ 135 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 136 for (j=0; j<M; j++) \ 137 for (k=0; k<BS; k++) { \ 138 FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \ 139 u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 140 FILTER(p2[i*MBS+j*BS+k] = t); \ 141 } \ 142 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 143 FILTER(Type *v); \ 144 FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \ 145 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 146 u2 = u + idx[opt->copy_start[i]]*MBS; \ 147 l = opt->copy_length[i]*MBS; \ 148 FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 149 ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 150 FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \ 151 p2 += l; \ 152 } \ 153 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 154 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 155 u2 = u + idx[opt->offset[r]]*MBS; \ 156 step = opt->stride_step[r]; \ 157 for (i=0; i<opt->stride_n[r]; i++) \ 158 for (j=0; j<M; j++) \ 159 for (k=0; k<BS; k++) { \ 160 FILTER(Type t = u2[i*step*MBS+j*BS+k]); \ 161 u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 162 FILTER(p2[i*MBS+j*BS+k] = t); \ 163 } \ 164 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 165 } \ 166 } \ 167 PetscFunctionReturn(0); \ 168 } 169 170 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 171 172 Arguments: 173 +action Unpack or Fetch 174 .opname Name of the Op, such as Add, Mult, LAND, etc. 175 .Type Type of the data 176 .BS Block size for vectorization 177 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 178 .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 179 .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 180 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 181 .CType Type with or without the const qualifier, i.e., const Type or Type 182 -Cvoid void with or without the const qualifier, i.e., const void or void 183 */ 184 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \ 185 static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 186 { \ 187 Type *u = (Type*)unpacked,*u2,t; \ 188 CType *p = (CType*)packed,*p2; \ 189 PetscInt i,j,k,l,r,step,bs=link->bs; \ 190 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 191 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 192 PetscFunctionBegin; \ 193 if (!idx) { \ 194 for (i=0; i<count; i++) \ 195 for (j=0; j<M; j++) \ 196 for (k=0; k<BS; k++) { \ 197 t = u[i*MBS+j*BS+k]; \ 198 APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 199 FILTER(p[i*MBS+j*BS+k] = t); \ 200 } \ 201 } else if (!opt) { /* No optimizations available */ \ 202 for (i=0; i<count; i++) \ 203 for (j=0; j<M; j++) \ 204 for (k=0; k<BS; k++) { \ 205 t = u[idx[i]*MBS+j*BS+k]; \ 206 APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 207 FILTER(p[i*MBS+j*BS+k] = t); \ 208 } \ 209 } else { \ 210 for (r=0; r<opt->n; r++) { \ 211 p2 = p + opt->offset[r]*MBS; \ 212 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 213 idx2 = idx + opt->offset[r]; \ 214 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 215 for (j=0; j<M; j++) \ 216 for (k=0; k<BS; k++) { \ 217 t = u[idx2[i]*MBS+j*BS+k]; \ 218 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 219 FILTER(p2[i*MBS+j*BS+k] = t); \ 220 } \ 221 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 222 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 223 u2 = u + idx[opt->copy_start[i]]*MBS; \ 224 l = opt->copy_length[i]*MBS; \ 225 for (j=0; j<l; j++) { \ 226 t = u2[j]; \ 227 APPLY (u2[j],t,op,p2[j]); \ 228 FILTER(p2[j] = t); \ 229 } \ 230 p2 += l; \ 231 } \ 232 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 233 u2 = u + idx[opt->offset[r]]*MBS; \ 234 step = opt->stride_step[r]; \ 235 for (i=0; i<opt->stride_n[r]; i++) \ 236 for (j=0; j<M; j++) \ 237 for (k=0; k<BS; k++) { \ 238 t = u2[i*step*MBS+j*BS+k]; \ 239 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 240 FILTER(p2[i*MBS+j*BS+k] = t); \ 241 } \ 242 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 243 } \ 244 } \ 245 PetscFunctionReturn(0); \ 246 } 247 248 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 249 250 Arguments: 251 +Action Unpack or Fetch 252 .locname Max or Min 253 .type1 Type of the first data in a pair type 254 .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 255 .op > or < 256 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 257 .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2) 258 -Cvoid void with or without the const qualifier, i.e., const void or void 259 */ 260 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \ 261 static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \ 262 PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \ 263 CType *p = (CType*)packed; \ 264 PetscInt i,j; \ 265 for (i=0; i<count; i++) { \ 266 FILTER(PairType(Type1,Type2) v); \ 267 j = idx? idx[i] : i; \ 268 FILTER(v = u[j]); \ 269 if (p[i].a op u[j].a) { \ 270 u[j] = p[i]; \ 271 } else if (p[i].a == u[j].a) { \ 272 u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 273 } \ 274 FILTER(p[i] = v); \ 275 } \ 276 PetscFunctionReturn(0); \ 277 } 278 279 /* Pack, Unpack/Fetch ops */ 280 #define DEF_Pack(Type,BS,EQ) \ 281 DEF_PackFunc(Type,BS,EQ) \ 282 DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \ 283 DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \ 284 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \ 285 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 286 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 287 link->h_FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \ 288 } 289 290 /* Add, Mult ops */ 291 #define DEF_Add(Type,BS,EQ) \ 292 DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \ 293 DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \ 294 DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \ 295 DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \ 296 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \ 297 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 298 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 299 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 300 link->h_FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \ 301 } 302 303 /* Max, Min ops */ 304 #define DEF_Cmp(Type,BS,EQ) \ 305 DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \ 306 DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \ 307 DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \ 308 DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \ 309 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \ 310 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 311 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 312 link->h_FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \ 313 link->h_FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \ 314 } 315 316 /* Logical ops. 317 The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 318 the compilation warning "empty macro arguments are undefined in ISO C90" 319 */ 320 #define DEF_Log(Type,BS,EQ) \ 321 DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \ 322 DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \ 323 DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \ 324 DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \ 325 DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \ 326 DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \ 327 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \ 328 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \ 329 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 330 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \ 331 link->h_FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \ 332 link->h_FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \ 333 link->h_FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \ 334 } 335 336 /* Bitwise ops */ 337 #define DEF_Bit(Type,BS,EQ) \ 338 DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \ 339 DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \ 340 DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \ 341 DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \ 342 DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \ 343 DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \ 344 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \ 345 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \ 346 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 347 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \ 348 link->h_FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \ 349 link->h_FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \ 350 link->h_FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \ 351 } 352 353 /* Maxloc, Minloc */ 354 #define DEF_Xloc(Type1,Type2) \ 355 DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \ 356 DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \ 357 DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \ 358 DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \ 359 static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \ 360 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \ 361 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \ 362 link->h_FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \ 363 link->h_FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \ 364 } 365 366 #define DEF_IntegerType(Type,BS,EQ) \ 367 DEF_Pack(Type,BS,EQ) \ 368 DEF_Add(Type,BS,EQ) \ 369 DEF_Cmp(Type,BS,EQ) \ 370 DEF_Log(Type,BS,EQ) \ 371 DEF_Bit(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 375 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 376 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 377 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 378 } 379 380 #define DEF_RealType(Type,BS,EQ) \ 381 DEF_Pack(Type,BS,EQ) \ 382 DEF_Add(Type,BS,EQ) \ 383 DEF_Cmp(Type,BS,EQ) \ 384 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \ 385 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 386 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 387 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 388 } 389 390 #if defined(PETSC_HAVE_COMPLEX) 391 #define DEF_ComplexType(Type,BS,EQ) \ 392 DEF_Pack(Type,BS,EQ) \ 393 DEF_Add(Type,BS,EQ) \ 394 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \ 395 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 396 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 397 } 398 #endif 399 400 #define DEF_DumbType(Type,BS,EQ) \ 401 DEF_Pack(Type,BS,EQ) \ 402 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \ 403 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 404 } 405 406 /* Maxloc, Minloc */ 407 #define DEF_PairType(Type1,Type2) \ 408 typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \ 409 DEF_Pack(PairType(Type1,Type2),1,1) \ 410 DEF_Xloc(Type1,Type2) \ 411 static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \ 412 CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \ 413 CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \ 414 } 415 416 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 417 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 418 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 419 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 420 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 421 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 422 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 423 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 424 425 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 426 DEF_IntegerType(int,1,1) 427 DEF_IntegerType(int,2,1) 428 DEF_IntegerType(int,4,1) 429 DEF_IntegerType(int,8,1) 430 DEF_IntegerType(int,1,0) 431 DEF_IntegerType(int,2,0) 432 DEF_IntegerType(int,4,0) 433 DEF_IntegerType(int,8,0) 434 #endif 435 436 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 437 typedef signed char SignedChar; 438 DEF_IntegerType(SignedChar,1,1) 439 DEF_IntegerType(SignedChar,2,1) 440 DEF_IntegerType(SignedChar,4,1) 441 DEF_IntegerType(SignedChar,8,1) 442 DEF_IntegerType(SignedChar,1,0) 443 DEF_IntegerType(SignedChar,2,0) 444 DEF_IntegerType(SignedChar,4,0) 445 DEF_IntegerType(SignedChar,8,0) 446 447 typedef unsigned char UnsignedChar; 448 DEF_IntegerType(UnsignedChar,1,1) 449 DEF_IntegerType(UnsignedChar,2,1) 450 DEF_IntegerType(UnsignedChar,4,1) 451 DEF_IntegerType(UnsignedChar,8,1) 452 DEF_IntegerType(UnsignedChar,1,0) 453 DEF_IntegerType(UnsignedChar,2,0) 454 DEF_IntegerType(UnsignedChar,4,0) 455 DEF_IntegerType(UnsignedChar,8,0) 456 457 DEF_RealType(PetscReal,1,1) 458 DEF_RealType(PetscReal,2,1) 459 DEF_RealType(PetscReal,4,1) 460 DEF_RealType(PetscReal,8,1) 461 DEF_RealType(PetscReal,1,0) 462 DEF_RealType(PetscReal,2,0) 463 DEF_RealType(PetscReal,4,0) 464 DEF_RealType(PetscReal,8,0) 465 466 #if defined(PETSC_HAVE_COMPLEX) 467 DEF_ComplexType(PetscComplex,1,1) 468 DEF_ComplexType(PetscComplex,2,1) 469 DEF_ComplexType(PetscComplex,4,1) 470 DEF_ComplexType(PetscComplex,8,1) 471 DEF_ComplexType(PetscComplex,1,0) 472 DEF_ComplexType(PetscComplex,2,0) 473 DEF_ComplexType(PetscComplex,4,0) 474 DEF_ComplexType(PetscComplex,8,0) 475 #endif 476 477 DEF_PairType(int,int) 478 DEF_PairType(PetscInt,PetscInt) 479 480 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 481 DEF_DumbType(char,1,1) 482 DEF_DumbType(char,2,1) 483 DEF_DumbType(char,4,1) 484 DEF_DumbType(char,1,0) 485 DEF_DumbType(char,2,0) 486 DEF_DumbType(char,4,0) 487 488 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 489 DEF_DumbType(DumbInt,1,1) 490 DEF_DumbType(DumbInt,2,1) 491 DEF_DumbType(DumbInt,4,1) 492 DEF_DumbType(DumbInt,8,1) 493 DEF_DumbType(DumbInt,1,0) 494 DEF_DumbType(DumbInt,2,0) 495 DEF_DumbType(DumbInt,4,0) 496 DEF_DumbType(DumbInt,8,0) 497 498 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 499 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 500 { 501 int ierr; 502 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 503 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 504 return MPI_SUCCESS; 505 } 506 #endif 507 508 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink) 509 { 510 PetscErrorCode ierr; 511 PetscSFPack link,*p; 512 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 513 514 PetscFunctionBegin; 515 /* Look for types in cache */ 516 for (p=&bas->inuse; (link=*p); p=&link->next) { 517 PetscBool match; 518 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 519 if (match && (rkey == link->rkey) && (lkey == link->lkey)) { 520 switch (cmode) { 521 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 522 case PETSC_USE_POINTER: break; 523 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 524 } 525 *mylink = link; 526 PetscFunctionReturn(0); 527 } 528 } 529 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 530 PetscFunctionReturn(0); 531 } 532 533 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 534 { 535 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 536 537 PetscFunctionBegin; 538 (*link)->rkey = NULL; 539 (*link)->lkey = NULL; 540 (*link)->next = bas->avail; 541 bas->avail = *link; 542 *link = NULL; 543 PetscFunctionReturn(0); 544 } 545 546 /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */ 547 PetscErrorCode PetscSFPackDestoryAvailable(PetscSFPack *avail) 548 { 549 PetscErrorCode ierr; 550 PetscSFPack link=*avail,next; 551 PetscInt i; 552 553 PetscFunctionBegin; 554 for (; link; link=next) { 555 next = link->next; 556 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 557 for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */ 558 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 559 } 560 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 561 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 562 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 563 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 564 565 #if defined(PETSC_HAVE_CUDA) 566 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 567 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 568 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 569 if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 570 #endif 571 ierr = PetscFree(link);CHKERRQ(ierr); 572 } 573 *avail = NULL; 574 PetscFunctionReturn(0); 575 } 576 577 /* Error out on unsupported overlapped communications */ 578 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey) 579 { 580 PetscErrorCode ierr; 581 PetscSFPack link,*p; 582 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 583 PetscBool match; 584 585 PetscFunctionBegin; 586 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 587 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 588 */ 589 if (rkey || lkey) { 590 for (p=&bas->inuse; (link=*p); p=&link->next) { 591 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 592 if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey); 593 } 594 } 595 PetscFunctionReturn(0); 596 } 597 598 PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit) 599 { 600 PetscErrorCode ierr; 601 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 602 PetscBool is2Int,is2PetscInt; 603 PetscMPIInt ni,na,nd,combiner; 604 #if defined(PETSC_HAVE_COMPLEX) 605 PetscInt nPetscComplex=0; 606 #endif 607 608 PetscFunctionBegin; 609 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 610 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 611 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 612 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 613 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 614 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 615 #if defined(PETSC_HAVE_COMPLEX) 616 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 617 #endif 618 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 619 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 620 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 621 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 622 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; 623 link->bs = 1; /* default */ 624 625 if (is2Int) { 626 PackInit_PairType_int_int(link); 627 link->bs = 1; 628 link->unitbytes = 2*sizeof(int); 629 link->basicunit = MPI_2INT; 630 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 631 PackInit_PairType_PetscInt_PetscInt(link); 632 link->bs = 1; 633 link->unitbytes = 2*sizeof(PetscInt); 634 link->basicunit = MPIU_2INT; 635 } else if (nPetscReal) { 636 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 637 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 638 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 639 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 640 link->bs = nPetscReal; 641 link->unitbytes = nPetscReal*sizeof(PetscReal); 642 link->basicunit = MPIU_REAL; 643 } else if (nPetscInt) { 644 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 645 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 646 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 647 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 648 link->bs = nPetscInt; 649 link->unitbytes = nPetscInt*sizeof(PetscInt); 650 link->basicunit = MPIU_INT; 651 #if defined(PETSC_USE_64BIT_INDICES) 652 } else if (nInt) { 653 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 654 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 655 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 656 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 657 link->bs = nInt; 658 link->unitbytes = nInt*sizeof(int); 659 link->basicunit = MPI_INT; 660 #endif 661 } else if (nSignedChar) { 662 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 663 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 664 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 665 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 666 link->bs = nSignedChar; 667 link->unitbytes = nSignedChar*sizeof(SignedChar); 668 link->basicunit = MPI_SIGNED_CHAR; 669 } else if (nUnsignedChar) { 670 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 671 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 672 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 673 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 674 link->bs = nUnsignedChar; 675 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 676 link->basicunit = MPI_UNSIGNED_CHAR; 677 #if defined(PETSC_HAVE_COMPLEX) 678 } else if (nPetscComplex) { 679 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 680 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 681 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 682 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 683 link->bs = nPetscComplex; 684 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 685 link->basicunit = MPIU_COMPLEX; 686 #endif 687 } else { 688 MPI_Aint lb,nbyte; 689 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 690 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 691 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 692 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 693 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 694 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 695 link->bs = nbyte; 696 link->unitbytes = nbyte; 697 link->basicunit = MPI_BYTE; 698 } else { 699 nInt = nbyte / sizeof(int); 700 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 701 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 702 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 703 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 704 link->bs = nInt; 705 link->unitbytes = nbyte; 706 link->basicunit = MPI_INT; 707 } 708 } 709 710 if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */ 711 else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 712 PetscFunctionReturn(0); 713 } 714 715 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*)) 716 { 717 PetscFunctionBegin; 718 *UnpackAndOp = NULL; 719 if (mtype == PETSC_MEMTYPE_HOST) { 720 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 721 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 722 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 723 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 724 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 725 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 726 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 727 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 728 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 729 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 730 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 731 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 732 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 733 } 734 #if defined(PETSC_HAVE_CUDA) 735 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 736 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 737 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 738 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 739 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 740 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 741 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 742 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 743 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 744 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 745 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 746 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 747 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 748 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 749 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 750 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 751 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 752 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 753 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 754 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 755 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 756 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 757 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 758 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 759 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 760 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 761 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 762 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 763 } 764 #endif 765 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 766 767 PetscFunctionReturn(0); 768 } 769 770 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*)) 771 { 772 PetscFunctionBegin; 773 *FetchAndOp = NULL; 774 if (mtype == PETSC_MEMTYPE_HOST) { 775 if (op == MPIU_REPLACE) *FetchAndOp = link->h_FetchAndInsert; 776 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd; 777 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax; 778 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin; 779 else if (op == MPI_MAXLOC) *FetchAndOp = link->h_FetchAndMaxloc; 780 else if (op == MPI_MINLOC) *FetchAndOp = link->h_FetchAndMinloc; 781 else if (op == MPI_PROD) *FetchAndOp = link->h_FetchAndMult; 782 else if (op == MPI_LAND) *FetchAndOp = link->h_FetchAndLAND; 783 else if (op == MPI_BAND) *FetchAndOp = link->h_FetchAndBAND; 784 else if (op == MPI_LOR) *FetchAndOp = link->h_FetchAndLOR; 785 else if (op == MPI_BOR) *FetchAndOp = link->h_FetchAndBOR; 786 else if (op == MPI_LXOR) *FetchAndOp = link->h_FetchAndLXOR; 787 else if (op == MPI_BXOR) *FetchAndOp = link->h_FetchAndBXOR; 788 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 789 } 790 #if defined(PETSC_HAVE_CUDA) 791 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 792 if (op == MPIU_REPLACE) *FetchAndOp = link->d_FetchAndInsert; 793 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd; 794 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax; 795 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin; 796 else if (op == MPI_MAXLOC) *FetchAndOp = link->d_FetchAndMaxloc; 797 else if (op == MPI_MINLOC) *FetchAndOp = link->d_FetchAndMinloc; 798 else if (op == MPI_PROD) *FetchAndOp = link->d_FetchAndMult; 799 else if (op == MPI_LAND) *FetchAndOp = link->d_FetchAndLAND; 800 else if (op == MPI_BAND) *FetchAndOp = link->d_FetchAndBAND; 801 else if (op == MPI_LOR) *FetchAndOp = link->d_FetchAndLOR; 802 else if (op == MPI_BOR) *FetchAndOp = link->d_FetchAndBOR; 803 else if (op == MPI_LXOR) *FetchAndOp = link->d_FetchAndLXOR; 804 else if (op == MPI_BXOR) *FetchAndOp = link->d_FetchAndBXOR; 805 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 806 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 807 if (op == MPIU_REPLACE) *FetchAndOp = link->da_FetchAndInsert; 808 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd; 809 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax; 810 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin; 811 else if (op == MPI_MAXLOC) *FetchAndOp = link->da_FetchAndMaxloc; 812 else if (op == MPI_MINLOC) *FetchAndOp = link->da_FetchAndMinloc; 813 else if (op == MPI_PROD) *FetchAndOp = link->da_FetchAndMult; 814 else if (op == MPI_LAND) *FetchAndOp = link->da_FetchAndLAND; 815 else if (op == MPI_BAND) *FetchAndOp = link->da_FetchAndBAND; 816 else if (op == MPI_LOR) *FetchAndOp = link->da_FetchAndLOR; 817 else if (op == MPI_BOR) *FetchAndOp = link->da_FetchAndBOR; 818 else if (op == MPI_LXOR) *FetchAndOp = link->da_FetchAndLXOR; 819 else if (op == MPI_BXOR) *FetchAndOp = link->da_FetchAndBXOR; 820 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 821 } 822 #endif 823 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 824 PetscFunctionReturn(0); 825 } 826 827 /* 828 Create pack/unpack optimization plans based on indice patterns available 829 830 Input Parameters: 831 + n - Number of target ranks 832 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 833 - idx - [*] Array storing indices 834 835 Output Parameters: 836 + opt - Optimization plans. Maybe NULL if no optimization can be built. 837 */ 838 PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 839 { 840 PetscErrorCode ierr; 841 PetscInt i,j,k,n_copies,tot_copies=0,step; 842 PetscBool strided,optimized=PETSC_FALSE; 843 PetscSFPackOpt opt; 844 845 PetscFunctionBegin; 846 if (!n) { 847 *out = NULL; 848 PetscFunctionReturn(0); 849 } 850 851 ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 852 ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 853 ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 854 if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/ 855 856 opt->n = n; 857 858 /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 859 for (i=0; i<n; i++) { /* for each target processor */ 860 /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 861 n_copies = 1; 862 for (j=offset[i]; j<offset[i+1]-1; j++) { 863 if (idx[j]+1 != idx[j+1]) n_copies++; 864 } 865 /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 866 then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 867 */ 868 if ((offset[i+1]-offset[i])/n_copies >= 32) { 869 opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 870 optimized = PETSC_TRUE; 871 tot_copies += n_copies; 872 } 873 } 874 875 /* Setup memcpy plan for each contiguous piece */ 876 k = 0; /* k-th copy */ 877 ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 878 for (i=0; i<n; i++) { /* for each target processor */ 879 if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 880 n_copies = 1; 881 opt->copy_start[k] = offset[i] - offset[0]; 882 for (j=offset[i]; j<offset[i+1]-1; j++) { 883 if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 884 n_copies++; 885 opt->copy_start[k+1] = j-offset[0]+1; 886 opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 887 k++; 888 } 889 } 890 /* Set copy length of the last copy for this target */ 891 opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 892 k++; 893 } 894 /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 895 opt->copy_offset[i+1] = k; 896 } 897 898 /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 899 for (i=0; i<n; i++) { /* for each remote */ 900 if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 901 strided = PETSC_TRUE; 902 step = idx[offset[i]+1] - idx[offset[i]]; 903 for (j=offset[i]; j<offset[i+1]-1; j++) { 904 if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 905 } 906 if (strided) { 907 opt->type[i] = PETSCSF_PACKOPT_STRIDE; 908 opt->stride_step[i] = step; 909 opt->stride_n[i] = offset[i+1] - offset[i]; 910 optimized = PETSC_TRUE; 911 } 912 } 913 } 914 /* If no rank gets optimized, free arrays to save memory */ 915 if (!optimized) { 916 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 917 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 918 ierr = PetscFree(opt);CHKERRQ(ierr); 919 *out = NULL; 920 } else *out = opt; 921 PetscFunctionReturn(0); 922 } 923 924 PetscErrorCode PetscSFPackOptDestory(PetscSFPackOpt *out) 925 { 926 PetscErrorCode ierr; 927 PetscSFPackOpt opt = *out; 928 929 PetscFunctionBegin; 930 if (opt) { 931 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 932 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 933 ierr = PetscFree(opt);CHKERRQ(ierr); 934 *out = NULL; 935 } 936 PetscFunctionReturn(0); 937 } 938