xref: /petsc/config/petsc_harness.sh (revision 0b3a6bba120a0cdd0b26cdef79233fff1bc80a01)
#!/bin/bash
#
# Common harness code shared by the generated test scripts.
#
# Derives the run directory from the invoking script's name, gives every
# variable the invoking script may have pre-set an (empty) default, and then
# moves into a fresh per-test run directory located next to the script.
scriptname=$(basename "$0")
rundir=${scriptname%.sh}
TIMEOUT=60
timeoutfactor=${timeoutfactor:=}
filter=${filter:=}
filter_output=${filter_output:=}
exec=${exec:=}
executable=${executable:=}
petsc_dir=${petsc_dir:=}
testlogtapfile=${testlogtapfile:=}
testlogerrfile=${testlogerrfile:=}
label=${label:=}

# Always work from the directory holding the script and remember its absolute
# path (used later to derive the make target when compiling).
# The previous guard, `test "$PWD"!="$(dirname "$0")"`, had no spaces around
# `!=`, so it tested a single non-empty word and was always true.  The change
# of directory is therefore made explicitly unconditional; a real comparison
# would leave abspath_scriptdir unset whenever the two paths matched.
cd "$(dirname "$0")" || exit
abspath_scriptdir=$PWD

# Remove output left over from a previous run of the same test.
if test -d "${rundir}" && test -n "${rundir}"; then
  rm -f "${rundir}"/*.tmp "${rundir}"/*.err "${rundir}"/*.out
fi
mkdir -p "${rundir}"

# Copy any auxiliary files into the run directory, keeping their relative
# sub-directory.  ${runfiles} is intentionally unquoted: it is a
# whitespace-separated list.
if test -n "${runfiles:=}"; then
  for runfile in ${runfiles}; do
      subdir=$(dirname "${runfile}")
      mkdir -p "${rundir}"/"${subdir}"
      cp -r "${runfile}" "${rundir}"/"${subdir}"
  done
fi
cd "${rundir}" || exit
31
32#
33# Method to print out general and script specific options
34#
print_usage() {
  # Print the option summary on stderr and exit with status 1.
  #   $1 - name shown in the "Usage:" line (callers pass "$0")
  # If the invoking script defines a function named `extrausage`, it is
  # called after the generic text so that script-specific options can be
  # listed as well.

cat >&2 <<EOF
Usage: $1 [options]

OPTIONS
  -a <args> ......... Override default arguments
  -c ................ Cleanup (remove generated files)
  -C ................ Compile
  -d ................ Launch in debugger
  -e <args> ......... Add extra arguments to default
  -E <args> ......... Add final arguments to default
  -f ................ force attempt to run test that would otherwise be skipped
  -h ................ help: print this message
  -n <integer> ...... Override the number of processors to use
  -j ................ Pass -j to petscdiff (just use diff)
  -J <arg> .......... Pass -J to petscdiff (just use diff with arg)
  -m ................ Update results using petscdiff
  -M ................ Update alt files using petscdiff
  -o <arg> .......... Output format: 'interactive', 'err_only'
  -p ................ Print command:  Print first command and exit
  -t <seconds> ...... Override the default timeout (default=$TIMEOUT sec)
  -U ................ run cUda-memcheck
  -V ................ run Valgrind
  -v ................ Verbose: Print commands
EOF

  if declare -f extrausage > /dev/null; then extrausage; fi
  exit 1
}
65###
66##  Arguments for overriding things
67#
# Defaults for everything the command line can override.  The boolean flags
# hold the command names `true`/`false` because they are later executed
# directly (e.g. `if $compile; then`).
output_fmt="interactive"
verbose=false
cleanup=false
compile=false
debugger=false
printcmd=false
mpiexec_function=false
force=false
diff_flags=""
while getopts "a:cCde:E:fhjJ:mMn:o:pt:UvV" arg
do
  case $arg in
    a ) args="$OPTARG"       ;;
    c ) cleanup=true         ;;
    C ) compile=true         ;;
    d ) debugger=true        ;;
    e ) extra_args="$OPTARG" ;;
    E ) final_args="$OPTARG" ;;
    f ) force=true           ;;
    h ) print_usage "$0"     ;;
    n ) nsize="$OPTARG"      ;;
    j ) diff_flags=$diff_flags" -j"      ;;
    J ) diff_flags=$diff_flags" -J $OPTARG" ;;
    m ) diff_flags=$diff_flags" -m"      ;;
    M ) diff_flags=$diff_flags" -M"      ;;
    o ) output_fmt=$OPTARG   ;;
    p ) printcmd=true        ;;
    t ) TIMEOUT=$OPTARG      ;;
    U ) mpiexec="petsc_mpiexec_cudamemcheck $mpiexec"
        mpiexec_function=true
        ;;
    V ) mpiexec="petsc_mpiexec_valgrind $mpiexec"
        mpiexec_function=true
        ;;
    v ) verbose=true         ;;
    *)  # To take care of any extra args
      # An option letter without its own branch above is stored in a variable
      # named after the letter (its argument, or "found" for a plain flag).
      # For an unknown option getopts sets arg to "?", which is not a valid
      # variable name: getopts has already printed a diagnostic, so nothing
      # is assigned.  printf -v replaces the former eval, which tried to run
      # "?=found" as a command and re-evaluated the option text.
      if [[ $arg =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
        printf -v "$arg" '%s' "${OPTARG:-found}"
      fi
      ;;
  esac
done
shift $(( OPTIND - 1 ))

# Individual tests can extend the default timeout through timeoutfactor.
# An unset or empty factor counts as 1; a bare empty name evaluates to 0 in
# shell arithmetic, which would export a timeout of 0.
export MPIEXEC_TIMEOUT=$(( TIMEOUT * ${timeoutfactor:-1} ))
STARTTIME=$(date +%s)

# Assemble the final argument string: extra args go first, final args last.
if test -n "$extra_args"; then
  args="$extra_args $args"
fi
if test -n "$final_args"; then
  args="$args $final_args"
fi
if $debugger; then
  args="-start_in_debugger $args"
fi
# The filters are wrapped in an (escaped) $'...' so that they are expanded
# only when the resulting command string is evaluated later.
if test -n "$filter"; then
  diff_flags=$diff_flags" -F \$'$filter'"
fi
if test -n "$filter_output"; then
  diff_flags=$diff_flags" -f \$'$filter_output'"
fi

# Init
success=0; failed=0; failures=""; rmfiles=""
total=0
todo=-1; skip=-1
job_level=0

# With -C, rebuild the test executable first.  The make target is the
# executable's path relative to petsc_dir.
if $compile; then
   curexec=$(basename "${exec}")
   fullexec=${abspath_scriptdir}/${curexec}
   maketarget=$(echo "${fullexec}" | sed "s#${petsc_dir}/*##")
   (cd "$petsc_dir" && make -f gmakefile.test "${maketarget}")
fi
146
147###
148##   Rest of code is functions
149#
function petsc_report_tapoutput() {
  # Emit one TAP result line and record it in the TAP log.
  #   $1 - "not" for a failure, empty for a success
  #   $2 - test label
  #   $3 - optional comment, appended as " # <comment>"
  # The line is always appended to ${testlogtapfile}.  In "err_only" output
  # mode only failures are shown (and also appended to ${testlogerrfile});
  # in every other mode the line is simply printed.
  notornot=$1
  test_label=$2
  comment=${3:+" # $3"}

  tap_message="${notornot} ok ${test_label}${comment}"

  # Log messages
  printf '%s\n' "${tap_message}" >> "${testlogtapfile}"

  case "${output_fmt}" in
    err_only)
      if [[ -n "${notornot}" ]]; then
        printf '%s\n' "${tap_message}" | tee -a "${testlogerrfile}"
      fi
      ;;
    *)
      printf '%s\n' "${tap_message}"
      ;;
  esac
}
171
function printcmd() {
  # Print command that can be run from PETSC_DIR, then exit the script.
  #   $1 - the command line as it would be run inside the run directory
  # The first ".." in the command is replaced by the parent of the current
  # directory, expressed relative to ${petsc_dir}, and everything from the
  # first ">" onward (the output redirections) is dropped.
  cmd="$1"
  basedir=$(dirname "${PWD}" | sed "s#${petsc_dir}/##")
  # No "%" escaping here: the result is printed through a '%s' format, so a
  # doubled "%%" would show up literally in the printed command.
  modcmd=$(printf '%s\n' "${cmd}" | sed -e "s#\.\.#${basedir}#" -e 's#>.*##')
  if $mpiexec_function; then
     # Have to expand valgrind/cudamemcheck
     modcmd=$(eval "$modcmd")
  fi
  printf '%s\n' "${modcmd}"
  exit
}
184
function petsc_testrun() {
  # Run one command, classify the outcome and report it as a TAP line.
  # First arg = Basic command
  # Second arg = stdout file
  # Third arg = stderr file
  # Fourth arg = label for reporting
  # Fifth arg = (optional) non-empty when the test checks error output:
  #             stderr is then merged into the stdout file and the exit code
  #             of the command is ignored
  # Updates the global counters success/failed/total, the failures list and
  # rmfiles.  Returns the (possibly adjusted) exit code of the command.
  rmfiles="${rmfiles} $2 $3"
  tlabel=$4
  error=$5
  # timed_out is a global: clear it so that a timeout detected in an earlier
  # call does not make a later, unrelated failure look like a timeout (which
  # would also suppress the detailed error output for it).
  timed_out=""
  cmd="$1 > $2 2> $3"
  if test -n "$error"; then
    cmd="$1 1> $2  2>&1"
  fi
  # Keep a copy of the exact command as a small runnable script.
  echo "$cmd" > "${tlabel}".sh; chmod 755 "${tlabel}".sh
  if $printcmd; then
     printcmd "$cmd"
  fi

  # The output of `time -p` is appended to timing.out.
  eval "{ time -p $cmd ; } 2>> timing.out"
  cmd_res=$?
  # If testing the error output then we don't test the error code itself
  if test -n "$error"; then
     cmd_res=0
  fi
  #  If it is a lack of GPU resources or MPI failure (Intel) then try once more
  #  See: src/sys/error/err.c
  #  Error #134 added to handle problems with the Radeon card for hip testing
  #  Error #144 added to handle problems with the MPI [ch3:sock] received packet of unknown type (1852472100)
  if [ $cmd_res -eq 96 ] || [ $cmd_res -eq 97 ] || [ $cmd_res -eq 98 ] || [ $cmd_res -eq 134 ] || [ $cmd_res -eq 144 ]; then
    printf "# retrying %s\n" "${tlabel}" | tee -a "${testlogerrfile}"
    sleep 3
    eval "{ time -p $cmd ; } 2>> timing.out"
    cmd_res=$?
  fi
  # Make sure both output files exist before they are inspected below.
  touch "$2" "$3"
  # It appears current MPICH and Open MPI just shut down the job execution and do not return an error code to the executable
  # ETIMEDOUT=110 was used by Open MPI 3.0.  MPICH used 255
  # Earlier Open MPI versions returned 1 and the error string
  # Here we only grep for error strings in output
  #if [ $cmd_res -eq 110 -o $cmd_res -eq 255 ] || \
  if \
        grep -F -q -s 'I_MPI_JOB_TIMEOUT' "$2" "$3" || \
        grep -F -q -s 'APPLICATION TIMED OUT' "$2" "$3" || \
        grep -F -q -s MPIEXEC_TIMEOUT "$2" "$3" || \
        grep -F -q -s 'APPLICATION TERMINATED WITH THE EXIT STRING: job ending due to timeout' "$2" "$3" || \
        grep -q -s "Timeout after [0-9]* seconds. Terminating job" "$2" "$3"; then
    timed_out=1
    # If timed out, then ensure non-zero error code
    if [ $cmd_res -eq 0 ]; then
      cmd_res=1
    fi
  fi

  # Report errors
  comment=""
  if test $cmd_res == 0; then
     # In verbose mode the command itself becomes the TAP comment.
     if "${verbose}"; then
        comment="${cmd}"
     fi
    petsc_report_tapoutput "" "$tlabel" "$comment"
    (( success=success+1 ))
  else
    if [ -n "$timed_out" ]; then
      comment="Exceeded timeout limit of $MPIEXEC_TIMEOUT s"
    else
      comment="Error code: ${cmd_res}"
    fi
    petsc_report_tapoutput "not" "$tlabel" "$comment"

    # Report errors in detail
    if [ -z "$timed_out" ]; then
      # We've had tests fail but stderr->stdout, as well as having
      # mpi_abort go to stderr which throws this test off.  Show both
      # with stdout first
      awk '{print "#\t" $0}' < "$2" | tee -a "${testlogerrfile}"
      # if statement is for diff tests
      if test "$2" != "$3"; then
        awk '{print "#\t" $0}' < "$3" | tee -a "${testlogerrfile}"
      fi
    fi
    (( failed=failed+1 ))
    failures="$failures $tlabel"
  fi
  (( total=success+failed ))
  return $cmd_res
}
270
function petsc_testend() {
  # Write the summary of this test script to <dir>/counts/${label}.counts
  # and, when cleanup was requested, delete the generated output files.
  #   $1 - directory under which the "counts" directory lives
  # The counts file holds the total/success/failed counters, the list of
  # failed labels, the todo/skip counters when they are positive, and a
  # "time" line derived from timing.out.
  logfile=$1/counts/${label}.counts
  logdir=$(dirname "$logfile")
  if ! test -d "$logdir"; then
    mkdir -p "$logdir"
  fi
  if ! test -e "$logfile"; then
    touch "$logfile"
  fi
  printf "total %s\n" "$total" > "$logfile"
  printf "success %s\n" "$success" >> "$logfile"
  printf "failed %s\n" "$failed" >> "$logfile"
  printf "failures %s\n" "$failures" >> "$logfile"
  if test "${todo}" -gt 0; then
    printf "todo %s\n" "$todo" >> "$logfile"
  fi
  if test "${skip}" -gt 0; then
    printf "skip %s\n" "$skip" >> "$logfile"
  fi
  ENDTIME=$(date +%s)
  # First number: the largest single user/sys value seen in timing.out;
  # second number: the sum of all of them.
  timing=$(touch timing.out && grep -E '(user|sys)' timing.out | awk '{if( sum1 == "" || $2 > sum1 ) { sum1=sprintf("%.2f",$2) } ; sum2 += sprintf("%.2f",$2)} END {printf "%.2f %.2f\n",sum1,sum2}')
  printf "time %s\n" "$timing" >> "$logfile"
  if $cleanup; then
    echo "Cleaning up"
    # rmfiles is a whitespace-separated list.  Passing it to rm as a single
    # quoted word named one non-existent file, so nothing was ever removed;
    # split it into an array (no glob expansion) instead.
    local -a stale_files=()
    IFS=$' \t\n' read -r -a stale_files <<< "${rmfiles}"
    if (( ${#stale_files[@]} > 0 )); then
      /bin/rm -f -- "${stale_files[@]}"
    fi
  fi
}
298
function petsc_mpiexec_cudamemcheck() {
  # loops over the argument list to find the call to the test executable and insert the
  # cuda memcheck command before it.
  #   "$@" - the complete launch command (launcher, its options, executable, arguments)
  # Environment overrides: PETSC_CUDAMEMCHECK_COMMAND (the tool to run) and
  # PETSC_CUDAMEMCHECK_ARGS (its options; disables the probing below).
  # When ${printcmd} is true the assembled command is printed instead of run.
  #
  # The tool command and its options are kept as strings (they may come from
  # the environment) but are split into words before use, so that each
  # option reaches the tool as its own argument.
  local -a memcheck_cmd_words=() memcheck_arg_words=() probe_words=()
  # first check if compute-sanitizer exists, since cuda-memcheck is deprecated from CUDA
  # 11-ish onwards
  if command -v compute-sanitizer &> /dev/null; then
    memcheck_cmd="${PETSC_CUDAMEMCHECK_COMMAND:-compute-sanitizer}"
    declare -a default_args_to_check=('--target-processes all' '--track-stream-ordered-races all')
  else
    memcheck_cmd="${PETSC_CUDAMEMCHECK_COMMAND:-cuda-memcheck}"
    declare -a default_args_to_check=('--flush-to-disk yes')
  fi
  IFS=$' \t\n' read -r -a memcheck_cmd_words <<< "${memcheck_cmd}"
  if [[ -z ${PETSC_CUDAMEMCHECK_ARGS} ]]; then
    # if user has not set the memcheck args themselves loop over the predefined default
    # arguments and check if they can be used
    memcheck_args='--leak-check full --report-api-errors no '
    for option in "${default_args_to_check[@]}"; do
      # Probe the tool with the options gathered so far plus the candidate;
      # the candidate is kept only if the tool exits successfully.
      IFS=$' \t\n' read -r -a probe_words <<< "${memcheck_args} ${option}"
      if "${memcheck_cmd_words[@]}" "${probe_words[@]}" &> /dev/null; then
        memcheck_args+="${option} "
      fi
    done
  else
    memcheck_args="${PETSC_CUDAMEMCHECK_ARGS}"
  fi
  IFS=$' \t\n' read -r -a memcheck_arg_words <<< "${memcheck_args}"
  pre_args=()
  # regex to detect where the test lives in the command line. This
  # marks the end of the options to mpiexec, and hence where we should insert the
  # cuda-memcheck command
  re="${executable}"
  for i in "$@"; do
    # first occurrence of the presence of petsc_arch is the executable,
    # except when we install MPI ourselves
    if [[ $i =~ ${re} ]]; then
      # found it, put cuda memcheck command in
      pre_args+=("${memcheck_cmd_words[@]}" "${memcheck_arg_words[@]}")
      break
    fi
    pre_args+=("$i")
    # drop the consumed word so that "$@" ends up starting at the executable
    shift
  done
  # run command, but filter out
  # ===== CUDA-MEMCHECK or ==== COMPUTE-SANITIZER
  # and
  # ===== ERROR SUMMARY: 0 errors
  if ${printcmd}; then
    echo "${pre_args[@]}" "$@"
  else
    "${pre_args[@]}" "$@" \
      | grep -v 'CUDA-MEMCHECK' \
      | grep -v 'COMPUTE-SANITIZER' \
      | grep -v 'LEAK SUMMARY: 0 bytes leaked in 0 allocations' \
      | grep -v 'ERROR SUMMARY: 0 errors' || [[ $? == 1 ]]
  fi
  # last or is needed to suppress grep exiting with error code 1 if it doesn't find a
  # match
}
356
function petsc_mpiexec_valgrind() {
  # Insert the valgrind command in front of the test executable in a launch
  # command line, then run the result (or print it when ${printcmd} is true).
  #   "$@" - the complete launch command (launcher, its options, executable, arguments)
  # The executable is the first word matching the regex ${executable}.
  #
  # The valgrind invocation is built as an array so that every launcher
  # argument, and a ${PETSC_DIR} containing spaces, is passed on as exactly
  # one word; expanding the list unquoted re-split such arguments.
  local -a valgrind_words=(
    valgrind -q --tool=memcheck --leak-check=yes --num-callers=20
    --track-origins=yes --keep-debuginfo=yes
    "--suppressions=${PETSC_DIR}/share/petsc/suppressions/valgrind"
    --error-exitcode=10
  )
  # Kept as a plain string as well, as before.
  valgrind_cmd="${valgrind_words[*]}"
  pre_args=()
  re="${executable}"
  for i in "$@"; do
    if [[ $i =~ ${re} ]]; then
      pre_args+=("${valgrind_words[@]}")
      break
    fi
    pre_args+=("$i")
    # drop the consumed word so that "$@" ends up starting at the executable
    shift
  done
  if ${printcmd}; then
    echo "${pre_args[@]}" "$@"
  else
    "${pre_args[@]}" "$@"
  fi
}
375export LC_ALL=C
376