#!/bin/bash
#
# Follows the kernel messages in journald and, whenever a line containing
# "<iface>: <HANG_PATTERN>" shows up, cycles <iface> with ifdown/ifup.
#
# Environment (all optional):
#   WATCH_BRIDGE      bridge whose first bridge port is watched (default: vmbr443)
#   WATCH_IFACE       interface to watch; when set, discovery is skipped
#   COOLDOWN_SECONDS  minimum number of seconds between two recoveries (default: 30)
#   HANG_PATTERN      text expected right after "<iface>: " in the log line
#                     (default: "Detected Hardware Unit Hang:")

set -u

WATCH_BRIDGE="${WATCH_BRIDGE:-vmbr443}"
WATCH_IFACE="${WATCH_IFACE:-}"
COOLDOWN_SECONDS="${COOLDOWN_SECONDS:-30}"
HANG_PATTERN="${HANG_PATTERN:-Detected Hardware Unit Hang:}"

#######################################
# Writes a timestamped message to stderr.
# Arguments: message words (joined with spaces)
# Outputs:   "<ISO-8601 timestamp> <message>" on stderr
#######################################
log() {
  printf '%s %s\n' "$(date -Is)" "$*" >&2
}

#######################################
# Prints the first port listed for WATCH_BRIDGE in the given interfaces files.
# Globals:   WATCH_BRIDGE (read)
# Arguments: one or more ifupdown-style configuration files
# Outputs:   the first word after bridge-ports/bridge_ports inside the
#            "iface <WATCH_BRIDGE>" stanza; nothing when there is no such line
#######################################
first_bridge_port() {
  awk -v bridge="$WATCH_BRIDGE" '
    # A stanza never continues into the next file.
    FNR == 1 { in_bridge = 0 }
    $1 == "iface" && $2 == bridge { in_bridge = 1; next }
    $1 == "iface" && $2 != bridge { in_bridge = 0 }
    # Both the hyphen and the underscore spelling of the option are accepted.
    in_bridge && ($1 == "bridge-ports" || $1 == "bridge_ports") { print $2; exit }
  ' "$@"
}

#######################################
# Determines which interface to watch.
# Globals:   WATCH_IFACE (read), WATCH_BRIDGE (read, via first_bridge_port)
# Outputs:   interface name on stdout
# Returns:   0 when a name was printed, 1 when none could be determined
#######################################
discover_watch_iface() {
  local candidate=""

  # An explicit override always wins and is passed through untouched.
  if [[ -n "$WATCH_IFACE" ]]; then
    printf '%s\n' "$WATCH_IFACE"
    return 0
  fi

  if [[ -r /etc/network/interfaces ]]; then
    candidate="$(first_bridge_port /etc/network/interfaces)"
  fi

  if [[ -z "$candidate" && -d /etc/network/interfaces.d ]]; then
    # stderr is silenced because an empty directory leaves the glob unexpanded
    # and awk would complain about the literal path.
    candidate="$(first_bridge_port /etc/network/interfaces.d/* 2>/dev/null)"
  fi

  # Keep only the part before the first dot (e.g. "eno1.100" -> "eno1");
  # presumably this maps a VLAN sub-interface to its parent device.
  candidate="${candidate%%.*}"

  # "none" is the keyword for a bridge without ports, not an interface name;
  # watching for it would never match anything.
  if [[ -z "$candidate" || "$candidate" == "none" ]]; then
    return 1
  fi

  printf '%s\n' "$candidate"
  return 0
}

#######################################
# Exits the script with status 1 when a command is not available.
# Arguments: $1 - command name
#######################################
require_command() {
  local cmd="$1"

  if ! command -v "$cmd" >/dev/null 2>&1; then
    log "missing required command: $cmd"
    exit 1
  fi
}

#######################################
# Cycles an interface with ifdown --force followed by ifup.
# Arguments: $1 - interface name
# Returns:   1 when ifup fails; a failing ifdown is only logged
#######################################
recover_iface() {
  local iface="$1"

  log "hardware hang detected on $iface; cycling link with ifdown/ifup"
  ifdown --force "$iface" || log "ifdown reported a non-zero exit code for $iface"
  sleep 2

  if ! ifup "$iface"; then
    log "ifup failed for $iface"
    return 1
  fi

  log "link recovery finished for $iface"
}

#######################################
# Entry point: validates the configuration, then follows the journal forever.
# Globals:   WATCH_BRIDGE, COOLDOWN_SECONDS, HANG_PATTERN (read)
# Returns:   1 when the journalctl stream ends; exits 1 on configuration errors
#######################################
main() {
  local iface=""
  local cooldown=0
  local last_recovery=0
  local now=0
  local line=""

  require_command journalctl
  require_command ifdown
  require_command ifup

  # Reject values that would make the arithmetic comparison below fail on
  # every event (e.g. "30s") instead of failing once at startup.
  if [[ ! "$COOLDOWN_SECONDS" =~ ^[0-9]+$ ]]; then
    log "COOLDOWN_SECONDS must be a non-negative integer, got: $COOLDOWN_SECONDS"
    exit 1
  fi
  # Force base 10 so a value with leading zeros is not parsed as octal.
  cooldown=$((10#$COOLDOWN_SECONDS))

  if ! iface="$(discover_watch_iface)"; then
    log "failed to determine uplink interface for bridge $WATCH_BRIDGE"
    exit 1
  fi

  log "watching journald for '$HANG_PATTERN' on interface $iface"

  while IFS= read -r line; do
    [[ "$line" == *"$iface: $HANG_PATTERN"* ]] || continue

    now="$(date +%s)"
    if (( now - last_recovery < cooldown )); then
      log "skipping duplicate event for $iface during cooldown (${COOLDOWN_SECONDS}s)"
      continue
    fi

    recover_iface "$iface"
    # Stamp the cooldown after the recovery has finished: lines that were
    # buffered while ifdown/ifup ran must not start another cycle, even when
    # the recovery itself took longer than the cooldown.
    last_recovery="$(date +%s)"
  done < <(journalctl --dmesg --follow --since now --output=cat)

  # The follow stream is not expected to end; report it as a failure so a
  # supervisor can notice instead of seeing a clean exit.
  log "journalctl stream ended unexpectedly; exiting"
  return 1
}

main "$@"