xref: /petsc/config/petsc_harness.sh (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1
2
# Name of the per-test run directory: the script name without its .sh suffix
scriptname=$(basename "$0")
rundir=${scriptname%.sh}
TIMEOUT=60

# Always work from the directory that contains this script and remember its
# absolute path (used later to locate the executable when compiling).
# The previous guard, test "$PWD"!=..., was a single non-empty word and thus
# always true, so doing this unconditionally keeps the established behavior.
cd "$(dirname "$0")" || exit 1
abspath_scriptdir=$PWD

# Remove stale outputs of a previous run
if test -n "${rundir}" && test -d "${rundir}"; then
  rm -f "${rundir}"/*.tmp "${rundir}"/*.err "${rundir}"/*.out
fi
mkdir -p "${rundir}" || exit 1
if test -n "${runfiles}"; then
  # runfiles is a whitespace-separated list, so it is expanded unquoted on purpose
  for runfile in ${runfiles}; do
      subdir=$(dirname "${runfile}")
      mkdir -p "${rundir}/${subdir}"
      cp -r "${runfile}" "${rundir}/${subdir}"
  done
fi
# Everything below runs inside the run directory; do not continue elsewhere
cd "${rundir}" || exit 1
23
24#
25# Method to print out general and script specific options
26#
print_usage() {
  # Print the general options to stderr, followed by the output of the
  # test-specific function `extrausage` when one is defined, then exit.
  # First arg = exit status to use (optional, defaults to 0)

cat >&2 <<EOF
Usage: $0 [options]

OPTIONS
  -a <args> ......... Override default arguments
  -c ................ Cleanup (remove generated files)
  -C ................ Compile
  -d ................ Launch in debugger
  -e <args> ......... Add extra arguments to default
  -f ................ force attempt to run test that would otherwise be skipped
  -h ................ help: print this message
  -n <integer> ...... Override the number of processors to use
  -j ................ Pass -j to petscdiff (just use diff)
  -J <arg> .......... Pass -J to petscdiff (just use diff with arg)
  -m ................ Update results using petscdiff
  -M ................ Update alt files using petscdiff
  -o <arg> .......... Output format: 'interactive', 'err_only'
  -p ................ Print command:  Print first command and exit
  -t <seconds> ...... Override the default timeout (default=$TIMEOUT sec)
  -U ................ run cUda-memcheck
  -V ................ run Valgrind
  -v ................ Verbose: Print commands
EOF

  if declare -f extrausage > /dev/null; then extrausage; fi
  # Use an explicit default so the status of extrausage cannot leak out
  exit "${1:-0}"
}
###
##  Arguments for overriding things
#
output_fmt="interactive"
verbose=false
cleanup=false
compile=false
debugger=false
printcmd=false
mpiexec_function=false
force=false
diff_flags=""
while getopts "a:cCde:fhjJ:mMn:o:pt:UvV" arg
do
  case $arg in
    a ) args="$OPTARG"       ;;
    c ) cleanup=true         ;;
    C ) compile=true         ;;
    d ) debugger=true        ;;
    e ) extra_args="$OPTARG" ;;
    f ) force=true           ;;
    h ) print_usage 0        ;;
    n ) nsize="$OPTARG"      ;;
    j ) diff_flags="${diff_flags} -j"          ;;
    J ) diff_flags="${diff_flags} -J $OPTARG"  ;;
    m ) diff_flags="${diff_flags} -m"          ;;
    M ) diff_flags="${diff_flags} -M"          ;;
    o ) output_fmt=$OPTARG   ;;
    p ) printcmd=true        ;;
    t ) TIMEOUT=$OPTARG      ;;
    # The memory checkers are shell functions that wrap the real launcher
    U ) mpiexec="petsc_mpiexec_cudamemcheck $mpiexec"
        mpiexec_function=true
        ;;
    V ) mpiexec="petsc_mpiexec_valgrind $mpiexec"
        mpiexec_function=true
        ;;
    v ) verbose=true         ;;
    * ) # Unknown option or missing argument: getopts sets arg to "?" and has
        # already printed its own diagnostic.  Keep going, as before, but do
        # not eval anything derived from the command line.
        printf '%s: ignoring unrecognized option (see -h for usage)\n' "$0" >&2
        ;;
  esac
done
shift $(( OPTIND - 1 ))
103
# Individual tests can extend the default through timeoutfactor; fall back to
# neutral values so a test that sets neither still gets a valid timeout
# instead of an arithmetic syntax error.
export MPIEXEC_TIMEOUT=$(( ${TIMEOUT:-60} * ${timeoutfactor:-1} ))
STARTTIME=$(date +%s)

if test -n "$extra_args"; then
  args="$args $extra_args"
fi
# An unset flag must count as "false": an empty command would succeed
if ${debugger:-false}; then
  args="-start_in_debugger $args"
fi
# The filters are handed to petscdiff as $'...' strings, to be expanded when
# the diff command line is eventually eval'ed
if test -n "$filter"; then
  diff_flags="${diff_flags} -F \$'$filter'"
fi
if test -n "$filter_output"; then
  diff_flags="${diff_flags} -f \$'$filter_output'"
fi
120
121
# Init: counters and lists updated by petsc_testrun and written by petsc_testend
success=0; failed=0; failures=""; rmfiles=""
total=0
todo=-1; skip=-1
job_level=0

# Optionally (re)build the test executable through the PETSc test makefile.
# An unset flag must count as "false": an empty command would succeed.
if ${compile:-false}; then
   curexec=$(basename "${exec}")
   fullexec=${abspath_scriptdir}/${curexec}
   # The make target is the executable path relative to petsc_dir
   maketarget=$(printf '%s\n' "${fullexec}" | sed "s#${petsc_dir}/*##")
   (cd "${petsc_dir}" && make -f gmakefile.test "${maketarget}")
fi
134
135###
136##   Rest of code is functions
137#
function petsc_report_tapoutput() {
  # Emit one TAP result line and append it to the TAP log
  # First arg = "" for a passing test, "not" for a failing one
  # Second arg = label of the test
  # Third arg = optional comment, appended as " # <comment>"
  # Reads the globals testlogtapfile, testlogerrfile and output_fmt
  local notornot=$1
  local test_label=$2
  local comment=$3
  local tap_message
  if test -n "$comment"; then
    comment=" # ${comment}"
  fi

  tap_message="${notornot} ok ${test_label}${comment}"

  # Log messages.  The message is data, never the printf format: labels and
  # comments (e.g. the command line in verbose mode) may contain % or \
  printf '%s\n' "${tap_message}" >> "${testlogtapfile}"

  if test "${output_fmt}" = "err_only"; then
     # Only failures are shown, and they are also collected in the error log
     if test -n "${notornot}"; then
        printf '%s\n' "${tap_message}" | tee -a "${testlogerrfile}"
     fi
  else
     printf '%s\n' "${tap_message}"
  fi
}
159
function printcmd() {
  # Print command that can be run from PETSC_DIR, then exit the script
  # First arg = command line as assembled by petsc_testrun, redirections included
  local cmd="$1"
  local basedir modcmd
  # Parent of the current (run) directory, made relative to petsc_dir
  basedir=$(dirname "${PWD}" | sed "s#${petsc_dir}/##")
  # Point the first ".." at that directory and cut the command at its first
  # redirection, including a file-descriptor number in front of it ("1> out")
  modcmd=$(printf '%s\n' "${cmd}" \
    | sed -e "s#\.\.#${basedir}#" -e 's#[[:space:]]*[0-9]*>.*##')
  if $mpiexec_function; then
     # Have to expand valgrind/cudamemchk
     modcmd=$(eval "$modcmd")
  fi
  # Print as data so that % and \ in the command come out unchanged
  printf '%s\n' "${modcmd}"
  exit
}
172
function petsc_testrun() {
  # Run one test command, time it, and report the outcome as a TAP line
  # First arg = Basic command
  # Second arg = stdout file
  # Third arg = stderr file
  # Fourth arg = label for reporting
  # Fifth arg = optional; when non-empty the error output is what is being
  #             tested: stderr is merged into the stdout file and the exit
  #             code of the command is ignored
  # Returns the exit code used for reporting.  Updates the global counters
  # success, failed, total and the lists failures and rmfiles.
  rmfiles="${rmfiles} $2 $3"
  tlabel=$4
  error=$5
  # Start every run without the timeout flag; a stale value from an earlier
  # run would turn each later failure into a reported timeout
  timed_out=""
  cmd="$1 > $2 2> $3"
  if test -n "$error"; then
    cmd="$1 1> $2  2>&1"
  fi
  # Keep a runnable copy of the exact command next to the outputs
  printf '%s\n' "$cmd" > "${tlabel}.sh"; chmod 755 "${tlabel}.sh"
  if $printcmd; then
     printcmd "$cmd"
  fi

  # The redirections inside $cmd capture the test's own output, so only the
  # report written by time reaches timing.out
  eval "{ time -p $cmd ; } 2>> timing.out"
  cmd_res=$?
  # If testing the error output then we don't test the error code itself
  if test -n "$error"; then
     cmd_res=0
  fi
  #  If it is a lack of GPU resources or MPI failure (Intel) then try once more
  #  See: src/sys/error/err.c
  #  Error #134 added to handle problems with the Radeon card for hip testing
  case $cmd_res in
    96|97|98|134)
      printf '# retrying %s\n' "${tlabel}" | tee -a "${testlogerrfile}"
      sleep 3
      eval "{ time -p $cmd ; } 2>> timing.out"
      cmd_res=$?
      ;;
  esac
  touch "$2" "$3"
  # It appears current MPICH and OpenMPI just shut down the job execution and do not return an error code to the executable
  # ETIMEDOUT=110 was used by OpenMPI 3.0.  MPICH used 255
  # Earlier OpenMPI versions returned 1 and the error string
  if [ "$cmd_res" -eq 110 ] || [ "$cmd_res" -eq 255 ] || \
        grep -F -q -s 'APPLICATION TIMED OUT' "$2" "$3" || \
        grep -F -q -s MPIEXEC_TIMEOUT "$2" "$3" || \
        grep -F -q -s 'APPLICATION TERMINATED WITH THE EXIT STRING: job ending due to timeout' "$2" "$3" || \
        grep -q -s "Timeout after [0-9]* seconds. Terminating job" "$2" "$3"; then
    timed_out=1
    # If timed out, then ensure non-zero error code
    if [ "$cmd_res" -eq 0 ]; then
      cmd_res=1
    fi
  fi

  # Report errors
  comment=""
  if [ "$cmd_res" -eq 0 ]; then
    if "${verbose}"; then
      comment="${cmd}"
    fi
    petsc_report_tapoutput "" "$tlabel" "$comment"
    success=$((success + 1))
  else
    if [ -n "$timed_out" ]; then
      comment="Exceeded timeout limit of $MPIEXEC_TIMEOUT s"
    else
      comment="Error code: ${cmd_res}"
    fi
    petsc_report_tapoutput "not" "$tlabel" "$comment"

    # Report errors in detail
    if [ -z "$timed_out" ]; then
      # We've had tests fail but stderr->stdout, as well as having
      # mpi_abort go to stderr which throws this test off.  Show both
      # with stdout first
      awk '{print "#\t" $0}' < "$2" | tee -a "${testlogerrfile}"
      # if statement is for diff tests
      if test "$2" != "$3"; then
        awk '{print "#\t" $0}' < "$3" | tee -a "${testlogerrfile}"
      fi
    fi
    failed=$((failed + 1))
    failures="$failures $tlabel"
  fi
  total=$((success + failed))
  return $cmd_res
}
254
function petsc_testend() {
  # Write the result counters and the timing summary of this test script
  # First arg = directory below which counts/<label>.counts is written
  # Reads the globals label, total, success, failed, failures, todo, skip,
  # cleanup and rmfiles, and the file timing.out in the current directory
  logfile=$1/counts/${label}.counts
  logdir=$(dirname "$logfile")
  mkdir -p "$logdir"
  # Values are printed as data: the failure list holds arbitrary test labels
  {
    printf 'total %s\n' "$total"
    printf 'success %s\n' "$success"
    printf 'failed %s\n' "$failed"
    printf 'failures %s\n' "$failures"
    if test "${todo}" -gt 0; then
      printf 'todo %s\n' "$todo"
    fi
    if test "${skip}" -gt 0; then
      printf 'skip %s\n' "$skip"
    fi
  } > "$logfile"
  ENDTIME=$(date +%s)
  # Largest single user/sys time and the sum of all of them.  The maximum is
  # tracked as a number; comparing against a formatted string would order
  # "9.00" above "10.50".
  touch timing.out
  timing=$(awk '/user|sys/ {
                  t = $2 + 0
                  if (!seen || t > max) { max = t; seen = 1 }
                  sum += sprintf("%.2f", t)
                }
                END { printf "%.2f %.2f\n", max, sum }' timing.out)
  printf 'time %s\n' "$timing" >> "$logfile"
  if $cleanup; then
    echo "Cleaning up"
    # rmfiles is a space-separated list, so it is expanded unquoted on purpose
    /bin/rm -f $rmfiles
  fi
}
282
function petsc_mpiexec_cudamemcheck() {
  # loops over the argument list to find the call to the test executable and insert the
  # cuda memcheck command before it.
  # Arguments = the complete launch command line: launcher, its options, test
  #             executable, test options
  # When printcmd is true the resulting command line is printed, not run.
  local memcheck_cmd memcheck_args option re
  local -a default_args_to_check memcheck_words pre_args
  # first check if compute-sanitizer exists, since cuda-memcheck is deprecated from CUDA
  # 11-ish onwards
  if command -v compute-sanitizer &> /dev/null; then
    memcheck_cmd="${PETSC_CUDAMEMCHECK_COMMAND:-compute-sanitizer}"
    default_args_to_check=('--target-processes all' '--track-stream-ordered-races all')
  else
    memcheck_cmd="${PETSC_CUDAMEMCHECK_COMMAND:-cuda-memcheck}"
    default_args_to_check=('--flush-to-disk yes')
  fi
  if [[ -z ${PETSC_CUDAMEMCHECK_ARGS} ]]; then
    # if user has not set the memcheck args themselves loop over the predefined default
    # arguments and check if they can be used
    memcheck_args='--leak-check full --report-api-errors no '
    for option in "${default_args_to_check[@]}"; do
      # these strings hold several words each and are split on purpose
      if ${memcheck_cmd} ${memcheck_args} ${option} &> /dev/null; then
        memcheck_args+="${option} "
      fi
    done
  else
    memcheck_args="${PETSC_CUDAMEMCHECK_ARGS}"
  fi
  # Split the checker command line into words once, without glob expansion,
  # so that everything below can be passed on fully quoted
  IFS=$' \t\n' read -r -a memcheck_words <<< "${memcheck_cmd} ${memcheck_args}"
  pre_args=()
  # regex to detect a path containing petsc-arch path, which is where the test lives. This
  # marks the end of the options to mpiexec, and hence where we should insert the
  # cuda-memcheck command
  re=".*${petsc_arch}.*"
  while (( $# > 0 )); do
    if [[ $1 =~ ${re} ]]; then
      # found it, put cuda memcheck command in
      pre_args+=("${memcheck_words[@]}")
      break
    fi
    pre_args+=("$1")
    shift
  done
  # "$@" now starts at the test executable (or is empty if none was found).
  # run command, but filter out
  # ===== CUDA-MEMCHECK or ==== COMPUTE-SANITIZER
  # and
  # ===== ERROR SUMMARY: 0 errors
  if ${printcmd}; then
    echo "${pre_args[@]}" "$@"
  else
    "${pre_args[@]}" "$@" \
      | grep -v 'CUDA-MEMCHECK' \
      | grep -v 'COMPUTE-SANITIZER' \
      | grep -v 'LEAK SUMMARY: 0 bytes leaked in 0 allocations' \
      | grep -v 'ERROR SUMMARY: 0 errors' || [[ $? == 1 ]]
  fi
  # last or is needed to suppress grep exiting with error code 1 if it doesn't find a
  # match
}
338
function petsc_mpiexec_valgrind() {
  # Launch the test under valgrind: the valgrind command is inserted between
  # the process-count option of the launcher and the rest of the command line
  # First arg = launcher (mpiexec)
  # Second arg = launcher option that takes the process count
  # Third arg = process count
  # Remaining args = test executable and its options
  # When printcmd is true the resulting command line is printed, not run.
  local _mpiexec npopt np
  local -a valgrind
  # some systems set $1 to be the function name
  if [[ $1 == 'petsc_mpiexec_valgrind' ]]; then
    shift
  fi
  _mpiexec=$1;shift
  npopt=$1;shift
  np=$1;shift

  # One array element per word keeps a PETSC_DIR containing whitespace intact
  valgrind=(valgrind -q --tool=memcheck --leak-check=yes --num-callers=20
            --track-origins=yes --keep-debuginfo=yes
            "--suppressions=$PETSC_DIR/share/petsc/valgrind/petsc-val.supp"
            --error-exitcode=10)

  if $printcmd; then
     echo "$_mpiexec" "$npopt" "$np" "${valgrind[@]}" "$@"
  else
     "$_mpiexec" "$npopt" "$np" "${valgrind[@]}" "$@"
  fi
}
# Run everything that follows in the C locale
LC_ALL=C
export LC_ALL
357