#!/bin/bash
#
# Watches the kernel log (via journalctl) for a NIC hardware-hang message on
# the uplink of a bridge and cycles that interface with ifdown/ifup.
#
# Environment:
#   WATCH_BRIDGE      bridge whose first port is watched (default: vmbr443)
#   WATCH_IFACE       interface to watch; when set, bridge discovery is skipped
#   COOLDOWN_SECONDS  minimum seconds between two recoveries (default: 30)
#   HANG_PATTERN      kernel message text that signals a hang
#                     (default: "Detected Hardware Unit Hang:")

# Abort on expansion of unset variables.
set -u

WATCH_BRIDGE="${WATCH_BRIDGE:-vmbr443}"
WATCH_IFACE="${WATCH_IFACE:-}"
COOLDOWN_SECONDS="${COOLDOWN_SECONDS:-30}"
HANG_PATTERN="${HANG_PATTERN:-Detected Hardware Unit Hang:}"
#######################################
# Writes a timestamped message to stderr.
# Arguments: all arguments are joined into one message line
# Outputs:   "<ISO-8601 timestamp> <message>" on stderr
#######################################
log() {
  local stamp
  stamp="$(date -Is)"
  printf '%s %s\n' "$stamp" "$*" >&2
}
#######################################
# Prints the first port configured for a bridge in ifupdown-style files.
# Arguments: $1 - bridge name; remaining - files to scan, in order
# Outputs:   the first port name on stdout, or nothing when none is found
#######################################
_first_bridge_port() {
  local bridge="$1"
  shift
  awk -v bridge="$bridge" '
    # A stanza never continues from one file into the next.
    FNR == 1 { in_bridge = 0 }
    $1 == "iface" { in_bridge = ($2 == bridge); next }
    # Both option spellings are accepted; "none" means the bridge has no port.
    in_bridge && ($1 == "bridge-ports" || $1 == "bridge_ports") && NF >= 2 && $2 != "none" {
      print $2
      exit
    }
  ' "$@"
}

#######################################
# Determines which interface to watch.
# Globals:   WATCH_IFACE (read)  - explicit override, printed as-is when set
#            WATCH_BRIDGE (read) - bridge whose first port is looked up
# Arguments: $1 - interfaces file (default: /etc/network/interfaces)
#            $2 - directory of extra files (default: /etc/network/interfaces.d)
# Outputs:   interface name on stdout, with any ".<suffix>" (e.g. a VLAN id)
#            removed when it came from the bridge configuration
# Returns:   0 when an interface was found, 1 otherwise
#######################################
discover_watch_iface() {
  local interfaces_file="${1:-/etc/network/interfaces}"
  local interfaces_dir="${2:-/etc/network/interfaces.d}"
  local candidate=""
  local path=""
  local -a fragments=()

  if [[ -n "$WATCH_IFACE" ]]; then
    printf '%s\n' "$WATCH_IFACE"
    return 0
  fi

  if [[ -r "$interfaces_file" ]]; then
    candidate="$(_first_bridge_port "$WATCH_BRIDGE" "$interfaces_file")"
  fi

  # The directory is only consulted when the main file gave no answer.
  if [[ -z "$candidate" && -d "$interfaces_dir" ]]; then
    # Collect readable regular files explicitly so an empty directory does
    # not hand awk an unexpanded glob.
    for path in "$interfaces_dir"/*; do
      if [[ -f "$path" && -r "$path" ]]; then
        fragments+=("$path")
      fi
    done
    if (( ${#fragments[@]} > 0 )); then
      candidate="$(_first_bridge_port "$WATCH_BRIDGE" "${fragments[@]}")"
    fi
  fi

  # Drop everything from the first dot: "eno1.443" -> "eno1".
  candidate="${candidate%%.*}"
  if [[ -n "$candidate" ]]; then
    printf '%s\n' "$candidate"
    return 0
  fi
  return 1
}
#######################################
# Ensures an external command is available; terminates the script otherwise.
# Arguments: $1 - command name to look up
# Outputs:   a log message when the command is missing
# Returns:   0 when the command exists; exits with status 1 when it does not
#######################################
require_command() {
  local tool="$1"

  # Guard clause: nothing to do when the command resolves.
  command -v "$tool" >/dev/null 2>&1 && return 0

  log "missing required command: $tool"
  exit 1
}
#######################################
# Cycles an interface: forced ifdown, a two-second pause, then ifup.
# Arguments: $1 - interface name
# Outputs:   progress and failure messages through log
# Returns:   1 when ifup fails; a failing ifdown is only logged
#######################################
recover_iface() {
  local nic="$1"

  log "hardware hang detected on $nic; cycling link with ifdown/ifup"

  # A failed ifdown is not fatal; the ifup below is still attempted.
  if ! ifdown --force "$nic"; then
    log "ifdown reported a non-zero exit code for $nic"
  fi

  sleep 2

  ifup "$nic" || {
    log "ifup failed for $nic"
    return 1
  }

  log "link recovery finished for $nic"
}
#######################################
# Follows the kernel journal and recovers the watched interface whenever a
# line containing "<iface>: <HANG_PATTERN>" appears.
# Globals:   WATCH_BRIDGE, HANG_PATTERN, COOLDOWN_SECONDS (read)
# Arguments: none
# Outputs:   status messages through log
# Returns:   exits with status 1 when a required command is missing, when
#            COOLDOWN_SECONDS is not a non-negative integer, or when no
#            interface can be determined; returns 1 when the journalctl
#            stream ends, because the watcher is then no longer watching
#######################################
main() {
  local iface=""
  local last_recovery=0
  local now=0
  local line=""
  local cooldown=0

  require_command journalctl
  require_command ifdown
  require_command ifup

  # Validate up front: a bad value would otherwise only fail inside the
  # arithmetic test below, at the moment the first hang is reported.
  if [[ ! "$COOLDOWN_SECONDS" =~ ^[0-9]+$ ]]; then
    log "COOLDOWN_SECONDS must be a non-negative integer, got '$COOLDOWN_SECONDS'"
    exit 1
  fi
  # Force base 10 so a value such as "030" is not read as octal.
  cooldown=$((10#$COOLDOWN_SECONDS))

  if ! iface="$(discover_watch_iface)"; then
    log "failed to determine uplink interface for bridge $WATCH_BRIDGE"
    exit 1
  fi

  log "watching journald for '$HANG_PATTERN' on interface $iface"

  # Process substitution keeps the loop in the current shell, so
  # last_recovery persists between iterations.
  while IFS= read -r line; do
    [[ "$line" == *"$iface: $HANG_PATTERN"* ]] || continue

    now="$(date +%s)"
    if (( now - last_recovery < cooldown )); then
      log "skipping duplicate event for $iface during cooldown (${cooldown}s)"
      continue
    fi

    # Record the attempt before recovering so that messages arriving during
    # the recovery fall inside the cooldown window.
    last_recovery="$now"
    recover_iface "$iface"
  done < <(journalctl --dmesg --follow --since now --output=cat)

  # --follow is not expected to finish; report it instead of ending quietly
  # with a success status.
  log "journalctl stream ended; no longer watching $iface"
  return 1
}
# Script entry point: run main, forwarding the command-line arguments.
main "$@"