#!/bin/bash

#
# This is the watchdog repair binary. If this script exits with a zero status
# the watchdog will not reboot the device, any other exit status reboots the
# device.
#
# It is not used to repair the system (we prefer to reboot), except as
# explicitly noted below in necessary cases, so we must ensure
# that the watchdog gets a permanent failure. Otherwise a transient failure
# could result in finally not rebooting the system, for instance in the
# following sequence:
#    - transient failure detected by watchdog
#    - this script is called
#         - prepares the system for reboot
#         - returns zero to delay the reboot
#    - the transient failure disappears
#    - the watchdog daemon main loop re-runs but detects no failures
#
# The consequence is that the system does not reboot, but it was prepared
# to reboot (for instance some services have been disabled). An example of
# a transient failure is a file that has not changed in the prescribed time.
#
# To avoid the above situation we use a dummy PID file, which is
# watched by the watchdog daemon. The PID in this file is 1, which is
# a process which always exists (init) so the test always succeeds as
# long as the file exists. This file is removed whenever we start
# this script, so that the watchdog always finds a failure condition after
# running this script.
#
# WARNING WARNING !!!!!!!
#
# Be very careful about modifying this script. It should never exit
# with a zero status by accident.
#
# This script should not block, since the watchdog daemon does not run nor
# acknowledge the hardware watchdog until we exit.
#
# We use nice -20 for sub-commands to be sure that they run promptly on a
# heavily loaded system.
#

#----------------------------------------------------------------------
# First check for things that we can repair or are transient conditions
#----------------------------------------------------------------------

# Forgive a watchdog failure that was caused by a deliberate service restart.
#
# Arguments:
#   $1 - path of the indicator (flag) file left behind by whoever restarted
#        the service.
#
# Behaviour:
#   - The indicator is removed in every case, so the same flag is never
#     evaluated twice.
#   - If the removal succeeded and the indicator was modified at most 60
#     seconds ago, the function sleeps 10 seconds and then terminates the
#     whole script with status 0 (it does NOT return to the caller).
#     An indicator time stamp lying in the future (negative age) also
#     counts as recent.
#   - Otherwise it returns to the caller, which carries on with the
#     remaining checks.
#
# Returns:
#   0 if the indicator was too old or could not be removed,
#   1 if the current time or the indicator's modification time could not
#     be determined.
check_restart()
{
    local indicator="$1"
    local maxold=60
    # Keep the scratch values local so they do not leak into the rest of
    # the script.
    local timenow timefile timediff

    # Check that the file is not older than MAXOLD,
    # otherwise the problem is not the restarted service
    timenow="$(date +%s)"
    timefile="$(stat -c %Y "$indicator")"

    # An empty value would silently count as 0 in the arithmetic below; an
    # empty "now" would then yield a negative age and exit 0 by accident.
    # Be explicit and never forgive the failure when a time stamp is missing.
    if [ -z "$timenow" ] || [ -z "$timefile" ]; then
        rm -f "$indicator"
        return 1
    fi

    timediff=$(( timenow - timefile ))

    # We only allow the watchdog to ignore this condition if
    # the flag file can be successfully removed to avoid entering this
    # test in a loop if the removal fails for some reason
    if rm -f "$indicator" && [ "$timediff" -le "$maxold" ]; then
        # allow a bit more time for the service to restart before the
        # watchdog tries to check it again
        sleep 10
        exit 0
    fi
    return 0
}

# Some services are restarted on purpose while the watchdog daemon is
# watching them, which can show up here as a watchdog failure:
#   - ntpd is restarted in some cases to pick up new configuration,
#   - spxupnpd may have been restarted due to a license change.
# Whoever restarts them leaves an indicator file behind; check_restart
# decides whether that indicator is recent enough to overlook the failure.
#
# The indicators are only looked at when the argument is 3=ESRCH (no such
# process) or 2=ENOENT (no such file, that is the pid file was not there).
NTPDRESTART=/var/run/ntpd-restart
SPXUPNPDRESTART=/var/run/spxmanage/spxupnpd-restart
case "$1" in
    2|3)
        # ntpd first, then spxupnpd
        for restart_flag in "$NTPDRESTART" "$SPXUPNPDRESTART"; do
            if [ -e "$restart_flag" ]; then
                check_restart "$restart_flag"
            fi
        done
        ;;
esac

# One possible failure is that the system time was changed due to a
# large step change ("hard") synchronization to an external clock via
# NTP, this can trigger "file not changed" errors or "no such process"
# errors if ntpd exited due to panic mode (and hopefully stepped the time).
#
# Variables set here:
#   CHECKRTC      - non-empty when the RTC has to be examined
#   OKIFRTCSYNCED - "yes" when a successfully re-synced RTC is enough to
#                   consider the failure repaired (exit 0)
#   NTPPIDFILE    - PID file of ntpd
#
# The argument is matched as a string so that an empty or non-numeric $1
# simply selects no branch instead of making `[ ... -eq ... ]` print
# "integer expression expected".
CHECKRTC=
OKIFRTCSYNCED=
NTPPIDFILE=/run/ntpd.pid
case "$1" in
    250)
        # NOTE(review): 250 is presumably the watchdog daemon's
        # "file not changed" error code - confirm against its headers.
        CHECKRTC=yes
        OKIFRTCSYNCED=yes
        ;;
    3)
        # 3=ESRCH (no such process)
        if [ -f "$NTPPIDFILE" ]; then
            NTPPID="$(< "$NTPPIDFILE")"
            if [ -n "$NTPPID" ] && [ ! -d "/proc/$NTPPID" ]; then
                # ntpd PID file valid but ntpd process no longer running.
                # OKIFRTCSYNCED stays empty: the RTC is examined, but this
                # case never exits 0 from here.
                CHECKRTC=yes
            fi
        fi
        ;;
esac
if [ -n "$CHECKRTC" ]; then
    spxrtc -l -R -s
    if [ $? -eq 2 ]; then
        # the RTC was not in sync and was successfully set,
        # if really in sync we can consider that we have
        # repaired the situation
        if spxrtc -l -R -c && [ "$OKIFRTCSYNCED" = "yes" ]; then
            exit 0
        fi
    fi
fi

#--------------------------------------------
# From here on a permanent failure is entered
#--------------------------------------------

# Remove the dummy PID file, to force a permanent failure condition:
# the watchdog daemon watches this file, so once it is gone the daemon
# keeps finding a failure even if the original one was only transient.
rm -f /var/run/watchdog-dummy.pid

# Check that updater is not running, if it is running it could be catastrophic
# to exit while an update is in progress, so we tell the watchdog that
# everything is fine.

if [ -x /etc/init.d/updater ]; then
    # exits with 0 if no updater running, 2 if running but within maximum wait
    # to exit and 1 if still running but maximum wait has elapsed
    nice -n -20 /etc/init.d/updater stop-nowait
    updater_status=$?
    if [ "$updater_status" -eq 2 ]; then
        # Updater running, we can still wait for it to stop
        exit 0
    fi
fi

# Hand the error code we were called with back to the watchdog, but only
# when it is a plain decimal number in 1..255. The shell truncates exit
# statuses to 8 bits, so passing "0", "256" or any other multiple of 256
# straight to `exit` would report success by accident and cancel the
# reboot. Anything else (missing, zero, out of range, non-numeric) becomes
# the generic failure status 2.
case "$1" in
    [1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])
        exit "$1"
        ;;
    *)
        exit 2
        ;;
esac
