1 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> 7 #endif 8 /* 9 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 10 * therefore we pack data types manually. This file defines packing routines for the standard data types. 11 */ 12 13 #define CPPJoin2(a,b) a ##_## b 14 #define CPPJoin3(a,b,c) a ##_## b ##_## c 15 #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d 16 #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d) 17 18 #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 19 #define IGNORE(statement) do {} while(0) 20 21 #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 22 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 23 #define LXOR_OP(r,s,op,t) do {(r) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 24 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 25 26 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */ 27 28 /* DEF_PackFunc - macro defining a Pack routine 29 30 Arguments of the macro: 31 +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 32 .BS Block size for vectorization. It is a factor of bs. 33 -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 34 35 Arguments of the Pack routine: 36 +count Number of indices in idx[] 37 .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count) 38 .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39 .opt Pack optimization plans. NULL means no plan at all. 40 .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count) 41 -packed Address of the packed data for each rank 42 */ 43 #define DEF_PackFunc(Type,BS,EQ) \ 44 static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45 { \ 46 PetscErrorCode ierr; \ 47 const Type *u = (const Type*)unpacked,*u2; \ 48 Type *p = (Type*)packed,*p2; \ 49 PetscInt i,j,k,l,r,step,bs=link->bs; \ 50 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 52 PetscFunctionBegin; \ 53 if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54 else if (!opt) { /* No optimizations available */ \ 55 for (i=0; i<count; i++) \ 56 for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57 for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58 p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59 } else { \ 60 for (r=0; r<opt->n; r++) { \ 61 p2 = p + opt->offset[r]*MBS; \ 62 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63 idx2 = idx + opt->offset[r]; \ 64 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65 for (j=0; j<M; j++) \ 66 for (k=0; k<BS; k++) \ 67 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 69 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70 u2 = u + idx[opt->copy_start[i]]*MBS; \ 71 l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72 ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73 p2 += l; \ 74 } \ 75 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76 u2 = u + idx[opt->offset[r]]*MBS; \ 77 step = opt->stride_step[r]; \ 78 for (i=0; i<opt->stride_n[r]; i++) \ 79 for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 80 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 81 } \ 82 } \ 83 PetscFunctionReturn(0); \ 84 } 85 86 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 87 88 Arguments: 89 +action Unpack or Fetch 90 .Type Type of the data 91 .BS Block size for vectorization 92 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 93 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 94 .CType Type with or without the const qualifier, i.e., const Type or Type 95 .Cvoid void with or without the const qualifier, i.e., const void or void 96 97 Notes: 98 This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 99 The two arguments CType and Cvoid are used (instead of one constness argument), because we want to 100 get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 101 sometimes we input 'const', sometimes we have to input empty. 102 103 If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare. 104 */ 105 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \ 106 static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 107 { \ 108 PetscErrorCode ierr; \ 109 Type *u = (Type*)unpacked,*u2; \ 110 CType *p = (CType*)packed,*p2; \ 111 PetscInt i,j,k,l,r,step,bs=link->bs; \ 112 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 113 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 114 PetscFunctionBegin; \ 115 if (!idx) { \ 116 FILTER(Type *v); \ 117 FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \ 118 FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \ 119 ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr); \ 120 FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \ 121 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 122 } else if (!opt) { /* No optimizations available */ \ 123 for (i=0; i<count; i++) \ 124 for (j=0; j<M; j++) \ 125 for (k=0; k<BS; k++) { \ 126 FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \ 127 u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 128 FILTER(p[i*MBS+j*BS+k] = t); \ 129 } \ 130 } else { \ 131 for (r=0; r<opt->n; r++) { \ 132 p2 = p + opt->offset[r]*MBS; \ 133 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 134 idx2 = idx + opt->offset[r]; \ 135 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 136 for (j=0; j<M; j++) \ 137 for (k=0; k<BS; k++) { \ 138 FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \ 139 u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 140 FILTER(p2[i*MBS+j*BS+k] = t); \ 141 } \ 142 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 143 FILTER(Type *v); \ 144 FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \ 145 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 146 u2 = u + idx[opt->copy_start[i]]*MBS; \ 147 l = opt->copy_length[i]*MBS; \ 148 FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 149 ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 150 FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \ 151 p2 += l; \ 152 } \ 153 FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 154 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 155 u2 = u + idx[opt->offset[r]]*MBS; \ 156 step = opt->stride_step[r]; \ 157 for (i=0; i<opt->stride_n[r]; i++) \ 158 for (j=0; j<M; j++) \ 159 for (k=0; k<BS; k++) { \ 160 FILTER(Type t = u2[i*step*MBS+j*BS+k]); \ 161 u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 162 FILTER(p2[i*MBS+j*BS+k] = t); \ 163 } \ 164 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 165 } \ 166 } \ 167 PetscFunctionReturn(0); \ 168 } 169 170 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 171 172 Arguments: 173 +action Unpack or Fetch 174 .opname Name of the Op, such as Add, Mult, LAND, etc. 175 .Type Type of the data 176 .BS Block size for vectorization 177 .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 178 .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 179 .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 180 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 181 .CType Type with or without the const qualifier, i.e., const Type or Type 182 -Cvoid void with or without the const qualifier, i.e., const void or void 183 */ 184 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \ 185 static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 186 { \ 187 Type *u = (Type*)unpacked,*u2,t; \ 188 CType *p = (CType*)packed,*p2; \ 189 PetscInt i,j,k,l,r,step,bs=link->bs; \ 190 const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 191 const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 192 PetscFunctionBegin; \ 193 if (!idx) { \ 194 for (i=0; i<count; i++) \ 195 for (j=0; j<M; j++) \ 196 for (k=0; k<BS; k++) { \ 197 t = u[i*MBS+j*BS+k]; \ 198 APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 199 FILTER(p[i*MBS+j*BS+k] = t); \ 200 } \ 201 } else if (!opt) { /* No optimizations available */ \ 202 for (i=0; i<count; i++) \ 203 for (j=0; j<M; j++) \ 204 for (k=0; k<BS; k++) { \ 205 t = u[idx[i]*MBS+j*BS+k]; \ 206 APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 207 FILTER(p[i*MBS+j*BS+k] = t); \ 208 } \ 209 } else { \ 210 for (r=0; r<opt->n; r++) { \ 211 p2 = p + opt->offset[r]*MBS; \ 212 if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 213 idx2 = idx + opt->offset[r]; \ 214 for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 215 for (j=0; j<M; j++) \ 216 for (k=0; k<BS; k++) { \ 217 t = u[idx2[i]*MBS+j*BS+k]; \ 218 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 219 FILTER(p2[i*MBS+j*BS+k] = t); \ 220 } \ 221 } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 222 for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 223 u2 = u + idx[opt->copy_start[i]]*MBS; \ 224 l = opt->copy_length[i]*MBS; \ 225 for (j=0; j<l; j++) { \ 226 t = u2[j]; \ 227 APPLY (u2[j],t,op,p2[j]); \ 228 FILTER(p2[j] = t); \ 229 } \ 230 p2 += l; \ 231 } \ 232 } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 233 u2 = u + idx[opt->offset[r]]*MBS; \ 234 step = opt->stride_step[r]; \ 235 for (i=0; i<opt->stride_n[r]; i++) \ 236 for (j=0; j<M; j++) \ 237 for (k=0; k<BS; k++) { \ 238 t = u2[i*step*MBS+j*BS+k]; \ 239 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 240 FILTER(p2[i*MBS+j*BS+k] = t); \ 241 } \ 242 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 243 } \ 244 } \ 245 PetscFunctionReturn(0); \ 246 } 247 248 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 249 250 Arguments: 251 +Action Unpack or Fetch 252 .locname Max or Min 253 .type1 Type of the first data in a pair type 254 .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 255 .op > or < 256 .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 257 .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2) 258 -Cvoid void with or without the const qualifier, i.e., const void or void 259 */ 260 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \ 261 static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \ 262 PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \ 263 CType *p = (CType*)packed; \ 264 PetscInt i,j; \ 265 for (i=0; i<count; i++) { \ 266 FILTER(PairType(Type1,Type2) v); \ 267 j = idx? idx[i] : i; \ 268 FILTER(v = u[j]); \ 269 if (p[i].a op u[j].a) { \ 270 u[j] = p[i]; \ 271 } else if (p[i].a == u[j].a) { \ 272 u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 273 } \ 274 FILTER(p[i] = v); \ 275 } \ 276 PetscFunctionReturn(0); \ 277 } 278 279 /* Pack, Unpack/Fetch ops */ 280 #define DEF_Pack(Type,BS,EQ) \ 281 DEF_PackFunc(Type,BS,EQ) \ 282 DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \ 283 DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \ 284 static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \ 285 link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 286 link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 287 link->h_FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \ 288 } 289 290 /* Add, Mult ops */ 291 #define DEF_Add(Type,BS,EQ) \ 292 DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \ 293 DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \ 294 DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \ 295 DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \ 296 static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \ 297 link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 298 link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 299 link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 300 link->h_FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \ 301 } 302 303 /* Max, Min ops */ 304 #define DEF_Cmp(Type,BS,EQ) \ 305 DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \ 306 DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \ 307 DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \ 308 DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \ 309 static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \ 310 link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 311 link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 312 link->h_FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \ 313 link->h_FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \ 314 } 315 316 /* Logical ops. 317 The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 318 the compilation warning "empty macro arguments are undefined in ISO C90" 319 */ 320 #define DEF_Log(Type,BS,EQ) \ 321 DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \ 322 DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \ 323 DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \ 324 DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \ 325 DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \ 326 DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \ 327 static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \ 328 link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \ 329 link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 330 link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \ 331 link->h_FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \ 332 link->h_FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \ 333 link->h_FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \ 334 } 335 336 /* Bitwise ops */ 337 #define DEF_Bit(Type,BS,EQ) \ 338 DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \ 339 DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \ 340 DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \ 341 DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \ 342 DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \ 343 DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \ 344 static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \ 345 link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \ 346 link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 347 link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \ 348 link->h_FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \ 349 link->h_FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \ 350 link->h_FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \ 351 } 352 353 /* Maxloc, Minloc */ 354 #define DEF_Xloc(Type1,Type2) \ 355 DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \ 356 DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \ 357 DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \ 358 DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \ 359 static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \ 360 link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \ 361 link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \ 362 link->h_FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \ 363 link->h_FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \ 364 } 365 366 #define DEF_IntegerType(Type,BS,EQ) \ 367 DEF_Pack(Type,BS,EQ) \ 368 DEF_Add(Type,BS,EQ) \ 369 DEF_Cmp(Type,BS,EQ) \ 370 DEF_Log(Type,BS,EQ) \ 371 DEF_Bit(Type,BS,EQ) \ 372 static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \ 373 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 375 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 376 CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 377 CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 378 } 379 380 #define DEF_RealType(Type,BS,EQ) \ 381 DEF_Pack(Type,BS,EQ) \ 382 DEF_Add(Type,BS,EQ) \ 383 DEF_Cmp(Type,BS,EQ) \ 384 static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \ 385 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 386 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 387 CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 388 } 389 390 #if defined(PETSC_HAVE_COMPLEX) 391 #define DEF_ComplexType(Type,BS,EQ) \ 392 DEF_Pack(Type,BS,EQ) \ 393 DEF_Add(Type,BS,EQ) \ 394 static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \ 395 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 396 CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 397 } 398 #endif 399 400 #define DEF_DumbType(Type,BS,EQ) \ 401 DEF_Pack(Type,BS,EQ) \ 402 static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \ 403 CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 404 } 405 406 /* Maxloc, Minloc */ 407 #define DEF_PairType(Type1,Type2) \ 408 typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \ 409 DEF_Pack(PairType(Type1,Type2),1,1) \ 410 DEF_Xloc(Type1,Type2) \ 411 static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \ 412 CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \ 413 CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \ 414 } 415 416 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 417 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 418 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 419 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 420 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 421 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 422 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 423 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 424 425 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 426 DEF_IntegerType(int,1,1) 427 DEF_IntegerType(int,2,1) 428 DEF_IntegerType(int,4,1) 429 DEF_IntegerType(int,8,1) 430 DEF_IntegerType(int,1,0) 431 DEF_IntegerType(int,2,0) 432 DEF_IntegerType(int,4,0) 433 DEF_IntegerType(int,8,0) 434 #endif 435 436 /* The typedefs are used to get a typename without space that CPPJoin can handle */ 437 typedef signed char SignedChar; 438 DEF_IntegerType(SignedChar,1,1) 439 DEF_IntegerType(SignedChar,2,1) 440 DEF_IntegerType(SignedChar,4,1) 441 DEF_IntegerType(SignedChar,8,1) 442 DEF_IntegerType(SignedChar,1,0) 443 DEF_IntegerType(SignedChar,2,0) 444 DEF_IntegerType(SignedChar,4,0) 445 DEF_IntegerType(SignedChar,8,0) 446 447 typedef unsigned char UnsignedChar; 448 DEF_IntegerType(UnsignedChar,1,1) 449 DEF_IntegerType(UnsignedChar,2,1) 450 DEF_IntegerType(UnsignedChar,4,1) 451 DEF_IntegerType(UnsignedChar,8,1) 452 DEF_IntegerType(UnsignedChar,1,0) 453 DEF_IntegerType(UnsignedChar,2,0) 454 DEF_IntegerType(UnsignedChar,4,0) 455 DEF_IntegerType(UnsignedChar,8,0) 456 457 DEF_RealType(PetscReal,1,1) 458 DEF_RealType(PetscReal,2,1) 459 DEF_RealType(PetscReal,4,1) 460 DEF_RealType(PetscReal,8,1) 461 DEF_RealType(PetscReal,1,0) 462 DEF_RealType(PetscReal,2,0) 463 DEF_RealType(PetscReal,4,0) 464 DEF_RealType(PetscReal,8,0) 465 466 #if defined(PETSC_HAVE_COMPLEX) 467 DEF_ComplexType(PetscComplex,1,1) 468 DEF_ComplexType(PetscComplex,2,1) 469 DEF_ComplexType(PetscComplex,4,1) 470 DEF_ComplexType(PetscComplex,8,1) 471 DEF_ComplexType(PetscComplex,1,0) 472 DEF_ComplexType(PetscComplex,2,0) 473 DEF_ComplexType(PetscComplex,4,0) 474 DEF_ComplexType(PetscComplex,8,0) 475 #endif 476 477 DEF_PairType(int,int) 478 DEF_PairType(PetscInt,PetscInt) 479 480 /* If we don't know the basic type, we treat it as a stream of chars or ints */ 481 DEF_DumbType(char,1,1) 482 DEF_DumbType(char,2,1) 483 DEF_DumbType(char,4,1) 484 DEF_DumbType(char,1,0) 485 DEF_DumbType(char,2,0) 486 DEF_DumbType(char,4,0) 487 488 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 489 DEF_DumbType(DumbInt,1,1) 490 DEF_DumbType(DumbInt,2,1) 491 DEF_DumbType(DumbInt,4,1) 492 DEF_DumbType(DumbInt,8,1) 493 DEF_DumbType(DumbInt,1,0) 494 DEF_DumbType(DumbInt,2,0) 495 DEF_DumbType(DumbInt,4,0) 496 DEF_DumbType(DumbInt,8,0) 497 498 #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 499 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 500 { 501 int ierr; 502 ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 503 ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 504 return MPI_SUCCESS; 505 } 506 #endif 507 508 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFPack *mylink) 509 { 510 PetscErrorCode ierr; 511 PetscSFPack link,*p; 512 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 513 514 PetscFunctionBegin; 515 /* Look for types in cache */ 516 for (p=&bas->inuse; (link=*p); p=&link->next) { 517 PetscBool match; 518 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 519 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 520 switch (cmode) { 521 case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 522 case PETSC_USE_POINTER: break; 523 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 524 } 525 *mylink = link; 526 PetscFunctionReturn(0); 527 } 528 } 529 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 530 PetscFunctionReturn(0); 531 } 532 533 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 534 { 535 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 536 537 PetscFunctionBegin; 538 (*link)->rootdata = NULL; 539 (*link)->leafdata = NULL; 540 (*link)->next = bas->avail; 541 bas->avail = *link; 542 *link = NULL; 543 PetscFunctionReturn(0); 544 } 545 546 /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */ 547 PetscErrorCode PetscSFPackDestroyAvailable(PetscSF sf,PetscSFPack *avail) 548 { 549 PetscErrorCode ierr; 550 PetscSFPack link=*avail,next; 551 PetscInt i; 552 553 PetscFunctionBegin; 554 for (; link; link=next) { 555 next = link->next; 556 if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 557 for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */ 558 if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 559 } 560 ierr = PetscFree(link->reqs);CHKERRQ(ierr); 561 562 #if defined(PETSC_HAVE_CUDA) 563 if (!use_gpu_aware_mpi && sf->use_pinned_buf) { /* In case the buffers are allocated specially */ 564 if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscFreePinnedMemory(link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);} 565 if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscFreePinnedMemory(link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);} 566 } 567 #endif 568 569 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 570 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 571 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 572 573 #if defined(PETSC_HAVE_CUDA) 574 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 575 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 576 ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 577 if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 578 #endif 579 ierr = PetscFree(link);CHKERRQ(ierr); 580 } 581 *avail = NULL; 582 PetscFunctionReturn(0); 583 } 584 585 /* Error out on unsupported overlapped communications */ 586 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 587 { 588 PetscErrorCode ierr; 589 PetscSFPack link,*p; 590 PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 591 PetscBool match; 592 593 PetscFunctionBegin; 594 /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 595 the potential overlapping since this process does not participate in communication. Overlapping is harmless. 596 */ 597 if (rootdata || leafdata) { 598 for (p=&bas->inuse; (link=*p); p=&link->next) { 599 ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 600 if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rootdata,leafdata); 601 } 602 } 603 PetscFunctionReturn(0); 604 } 605 606 PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit) 607 { 608 PetscErrorCode ierr; 609 PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 610 PetscBool is2Int,is2PetscInt; 611 PetscMPIInt ni,na,nd,combiner; 612 #if defined(PETSC_HAVE_COMPLEX) 613 PetscInt nPetscComplex=0; 614 #endif 615 616 PetscFunctionBegin; 617 ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 618 ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 619 /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 620 ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 621 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 622 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 623 #if defined(PETSC_HAVE_COMPLEX) 624 ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 625 #endif 626 ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 627 ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 628 /* TODO: shaell we also handle Fortran MPI_2REAL? */ 629 ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 630 link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 631 link->bs = 1; /* default */ 632 633 if (is2Int) { 634 PackInit_PairType_int_int(link); 635 link->bs = 1; 636 link->unitbytes = 2*sizeof(int); 637 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 638 link->basicunit = MPI_2INT; 639 link->unit = MPI_2INT; 640 } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 641 PackInit_PairType_PetscInt_PetscInt(link); 642 link->bs = 1; 643 link->unitbytes = 2*sizeof(PetscInt); 644 link->basicunit = MPIU_2INT; 645 link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 646 link->unit = MPIU_2INT; 647 } else if (nPetscReal) { 648 if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 649 else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 650 else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 651 else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 652 link->bs = nPetscReal; 653 link->unitbytes = nPetscReal*sizeof(PetscReal); 654 link->basicunit = MPIU_REAL; 655 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 656 } else if (nPetscInt) { 657 if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 658 else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 659 else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 660 else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 661 link->bs = nPetscInt; 662 link->unitbytes = nPetscInt*sizeof(PetscInt); 663 link->basicunit = MPIU_INT; 664 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 665 #if defined(PETSC_USE_64BIT_INDICES) 666 } else if (nInt) { 667 if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 668 else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 669 else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 670 else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 671 link->bs = nInt; 672 link->unitbytes = nInt*sizeof(int); 673 link->basicunit = MPI_INT; 674 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 675 #endif 676 } else if (nSignedChar) { 677 if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 678 else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 679 else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 680 else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 681 link->bs = nSignedChar; 682 link->unitbytes = nSignedChar*sizeof(SignedChar); 683 link->basicunit = MPI_SIGNED_CHAR; 684 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 685 } else if (nUnsignedChar) { 686 if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 687 else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 688 else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 689 else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 690 link->bs = nUnsignedChar; 691 link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 692 link->basicunit = MPI_UNSIGNED_CHAR; 693 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 694 #if defined(PETSC_HAVE_COMPLEX) 695 } else if (nPetscComplex) { 696 if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 697 else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 698 else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 699 else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 700 link->bs = nPetscComplex; 701 link->unitbytes = nPetscComplex*sizeof(PetscComplex); 702 link->basicunit = MPIU_COMPLEX; 703 if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 704 #endif 705 } else { 706 MPI_Aint lb,nbyte; 707 ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 708 if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 709 if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 710 if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 711 else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 712 else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 713 link->bs = nbyte; 714 link->unitbytes = nbyte; 715 link->basicunit = MPI_BYTE; 716 } else { 717 nInt = nbyte / sizeof(int); 718 if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 719 else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 720 else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 721 else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 722 link->bs = nInt; 723 link->unitbytes = nbyte; 724 link->basicunit = MPI_INT; 725 } 726 if (link->isbuiltin) link->unit = unit; 727 } 728 729 if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 730 PetscFunctionReturn(0); 731 } 732 733 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*)) 734 { 735 PetscFunctionBegin; 736 *UnpackAndOp = NULL; 737 if (mtype == PETSC_MEMTYPE_HOST) { 738 if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 739 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 740 else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 741 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 742 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 743 else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 744 else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 745 else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 746 else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 747 else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 748 else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 749 else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 750 else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 751 } 752 #if defined(PETSC_HAVE_CUDA) 753 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 754 if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 755 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 756 else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 757 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 758 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 759 else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 760 else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 761 else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 762 else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 763 else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 764 else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 765 else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 766 else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 767 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 768 if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 769 else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 770 else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 771 else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 772 else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 773 else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 774 else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 775 else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 776 else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 777 else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 778 else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 779 else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 780 else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 781 } 782 #endif 783 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 784 785 PetscFunctionReturn(0); 786 } 787 788 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*)) 789 { 790 PetscFunctionBegin; 791 *FetchAndOp = NULL; 792 if (mtype == PETSC_MEMTYPE_HOST) { 793 if (op == MPIU_REPLACE) *FetchAndOp = link->h_FetchAndInsert; 794 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd; 795 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax; 796 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin; 797 else if (op == MPI_MAXLOC) *FetchAndOp = link->h_FetchAndMaxloc; 798 else if (op == MPI_MINLOC) *FetchAndOp = link->h_FetchAndMinloc; 799 else if (op == MPI_PROD) *FetchAndOp = link->h_FetchAndMult; 800 else if (op == MPI_LAND) *FetchAndOp = link->h_FetchAndLAND; 801 else if (op == MPI_BAND) *FetchAndOp = link->h_FetchAndBAND; 802 else if (op == MPI_LOR) *FetchAndOp = link->h_FetchAndLOR; 803 else if (op == MPI_BOR) *FetchAndOp = link->h_FetchAndBOR; 804 else if (op == MPI_LXOR) *FetchAndOp = link->h_FetchAndLXOR; 805 else if (op == MPI_BXOR) *FetchAndOp = link->h_FetchAndBXOR; 806 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 807 } 808 #if defined(PETSC_HAVE_CUDA) 809 else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 810 if (op == MPIU_REPLACE) *FetchAndOp = link->d_FetchAndInsert; 811 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd; 812 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax; 813 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin; 814 else if (op == MPI_MAXLOC) *FetchAndOp = link->d_FetchAndMaxloc; 815 else if (op == MPI_MINLOC) *FetchAndOp = link->d_FetchAndMinloc; 816 else if (op == MPI_PROD) *FetchAndOp = link->d_FetchAndMult; 817 else if (op == MPI_LAND) *FetchAndOp = link->d_FetchAndLAND; 818 else if (op == MPI_BAND) *FetchAndOp = link->d_FetchAndBAND; 819 else if (op == MPI_LOR) *FetchAndOp = link->d_FetchAndLOR; 820 else if (op == MPI_BOR) *FetchAndOp = link->d_FetchAndBOR; 821 else if (op == MPI_LXOR) *FetchAndOp = link->d_FetchAndLXOR; 822 else if (op == MPI_BXOR) *FetchAndOp = link->d_FetchAndBXOR; 823 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 824 } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 825 if (op == MPIU_REPLACE) *FetchAndOp = link->da_FetchAndInsert; 826 else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd; 827 else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax; 828 else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin; 829 else if (op == MPI_MAXLOC) *FetchAndOp = link->da_FetchAndMaxloc; 830 else if (op == MPI_MINLOC) *FetchAndOp = link->da_FetchAndMinloc; 831 else if (op == MPI_PROD) *FetchAndOp = link->da_FetchAndMult; 832 else if (op == MPI_LAND) *FetchAndOp = link->da_FetchAndLAND; 833 else if (op == MPI_BAND) *FetchAndOp = link->da_FetchAndBAND; 834 else if (op == MPI_LOR) *FetchAndOp = link->da_FetchAndLOR; 835 else if (op == MPI_BOR) *FetchAndOp = link->da_FetchAndBOR; 836 else if (op == MPI_LXOR) *FetchAndOp = link->da_FetchAndLXOR; 837 else if (op == MPI_BXOR) *FetchAndOp = link->da_FetchAndBXOR; 838 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 839 } 840 #endif 841 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 842 PetscFunctionReturn(0); 843 } 844 845 /* 846 Create pack/unpack optimization plans based on indice patterns available 847 848 Input Parameters: 849 + n - Number of target ranks 850 . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 851 - idx - [*] Array storing indices 852 853 Output Parameters: 854 + opt - Optimization plans. Maybe NULL if no optimization can be built. 855 */ 856 PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 857 { 858 PetscErrorCode ierr; 859 PetscInt i,j,k,n_copies,tot_copies=0,step; 860 PetscBool strided,optimized=PETSC_FALSE; 861 PetscSFPackOpt opt; 862 863 PetscFunctionBegin; 864 if (!n) { 865 *out = NULL; 866 PetscFunctionReturn(0); 867 } 868 869 ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 870 ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 871 ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 872 if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/ 873 874 opt->n = n; 875 876 /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */ 877 for (i=0; i<n; i++) { /* for each target processor */ 878 /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 879 n_copies = 1; 880 for (j=offset[i]; j<offset[i+1]-1; j++) { 881 if (idx[j]+1 != idx[j+1]) n_copies++; 882 } 883 /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 884 then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 885 */ 886 if ((offset[i+1]-offset[i])/n_copies >= 32) { 887 opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 888 optimized = PETSC_TRUE; 889 tot_copies += n_copies; 890 } 891 } 892 893 /* Setup memcpy plan for each contiguous piece */ 894 k = 0; /* k-th copy */ 895 ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 896 for (i=0; i<n; i++) { /* for each target processor */ 897 if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 898 n_copies = 1; 899 opt->copy_start[k] = offset[i] - offset[0]; 900 for (j=offset[i]; j<offset[i+1]-1; j++) { 901 if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 902 n_copies++; 903 opt->copy_start[k+1] = j-offset[0]+1; 904 opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 905 k++; 906 } 907 } 908 /* Set copy length of the last copy for this target */ 909 opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 910 k++; 911 } 912 /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 913 opt->copy_offset[i+1] = k; 914 } 915 916 /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 917 for (i=0; i<n; i++) { /* for each remote */ 918 if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 919 strided = PETSC_TRUE; 920 step = idx[offset[i]+1] - idx[offset[i]]; 921 for (j=offset[i]; j<offset[i+1]-1; j++) { 922 if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 923 } 924 if (strided) { 925 opt->type[i] = PETSCSF_PACKOPT_STRIDE; 926 opt->stride_step[i] = step; 927 opt->stride_n[i] = offset[i+1] - offset[i]; 928 optimized = PETSC_TRUE; 929 } 930 } 931 } 932 /* If no rank gets optimized, free arrays to save memory */ 933 if (!optimized) { 934 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 935 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 936 ierr = PetscFree(opt);CHKERRQ(ierr); 937 *out = NULL; 938 } else *out = opt; 939 PetscFunctionReturn(0); 940 } 941 942 PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out) 943 { 944 PetscErrorCode ierr; 945 PetscSFPackOpt opt = *out; 946 947 PetscFunctionBegin; 948 if (opt) { 949 ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 950 ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 951 ierr = PetscFree(opt);CHKERRQ(ierr); 952 *out = NULL; 953 } 954 PetscFunctionReturn(0); 955 } 956