f16725e 3 months ago History
1 contributor
102 lines | 2.642kb
#!/bin/bash
#
# Watches the kernel log (through journald) for NIC hardware-hang messages on
# the uplink of a bridge and cycles that interface with ifdown/ifup.
#
# Environment overrides:
#   WATCH_BRIDGE      bridge whose uplink is looked up (default: vmbr443)
#   WATCH_IFACE       interface to watch; when empty it is discovered from
#                     the bridge configuration
#   COOLDOWN_SECONDS  minimum gap between two recoveries (default: 30)
#   HANG_PATTERN      kernel message text that signals a hang

# Treat expansion of an unset variable as an error.
set -o nounset

# Keep whatever the environment provides; otherwise fall back to the default.
: "${WATCH_BRIDGE:=vmbr443}"
: "${WATCH_IFACE:=}"
: "${COOLDOWN_SECONDS:=30}"
: "${HANG_PATTERN:=Detected Hardware Unit Hang:}"

# Write a timestamped diagnostic line to stderr.
# Arguments: $@ - message words, joined into a single line
# Outputs:   "<ISO-8601 timestamp> <message>" on stderr
log() {
    local stamp
    stamp="$(date -Is)"
    printf '%s %s\n' "$stamp" "$*" >&2
}

# Print the first port configured for a bridge in ifupdown-style files.
# Arguments: $1 - bridge name; $2... - files to scan in order
# Outputs:   the port name on stdout, or nothing when the bridge has no
#            port line or its port is "none" (bridge without an uplink)
_bridge_uplink_from() {
    local bridge="$1"
    shift

    awk -v bridge="$bridge" '
        # A stanza does not continue into the next file.
        FNR == 1 { in_bridge = 0 }
        # Every iface line opens a new stanza; remember whether it is ours.
        $1 == "iface" { in_bridge = ($2 == bridge); next }
        # Both option spellings are accepted: bridge-ports and bridge_ports.
        in_bridge && ($1 == "bridge-ports" || $1 == "bridge_ports") {
            if ($2 != "" && $2 != "none") print $2
            exit
        }
    ' "$@"
}

# Determine which interface should be watched.
# Globals:   WATCH_IFACE (read)  - explicit override, returned verbatim
#            WATCH_BRIDGE (read) - bridge whose first port is looked up
#            WATCH_INTERFACES_FILE, WATCH_INTERFACES_DIR (read, optional) -
#              alternative config locations; default to
#              /etc/network/interfaces and /etc/network/interfaces.d
# Outputs:   interface name on stdout
# Returns:   0 when an interface was found, 1 otherwise
discover_watch_iface() {
    local interfaces_file="${WATCH_INTERFACES_FILE:-/etc/network/interfaces}"
    local interfaces_dir="${WATCH_INTERFACES_DIR:-/etc/network/interfaces.d}"
    local candidate=""

    if [[ -n "$WATCH_IFACE" ]]; then
        printf '%s\n' "$WATCH_IFACE"
        return 0
    fi

    if [[ -r "$interfaces_file" ]]; then
        candidate="$(_bridge_uplink_from "$WATCH_BRIDGE" "$interfaces_file")"
    fi

    if [[ -z "$candidate" && -d "$interfaces_dir" ]]; then
        # An empty directory leaves the glob unexpanded; awk then fails on the
        # literal pattern, which is silenced on purpose.
        candidate="$(
            _bridge_uplink_from "$WATCH_BRIDGE" "$interfaces_dir"/* 2>/dev/null
        )"
    fi

    # Drop everything from the first dot (e.g. a VLAN suffix: eno1.443 -> eno1)
    # before deciding, so an empty remainder is not reported as success.
    candidate="${candidate%%.*}"
    if [[ -n "$candidate" ]]; then
        printf '%s\n' "$candidate"
        return 0
    fi

    return 1
}

# Abort the script unless the given command can be resolved by the shell.
# Arguments: $1 - command name to look up
# Returns:   0 when the command exists; otherwise logs and exits with 1
require_command() {
    local tool="$1"

    if command -v "$tool" >/dev/null 2>&1; then
        return 0
    fi

    log "missing required command: $tool"
    exit 1
}

# Cycle an interface: force it down, pause briefly, bring it back up.
# Arguments: $1 - interface name
# Returns:   0 when ifup succeeded, 1 when it failed. A failing ifdown is
#            only logged and does not stop the attempt to bring the link up.
recover_iface() {
    local nic="$1"

    log "hardware hang detected on $nic; cycling link with ifdown/ifup"

    if ! ifdown --force "$nic"; then
        log "ifdown reported a non-zero exit code for $nic"
    fi

    sleep 2

    ifup "$nic" || {
        log "ifup failed for $nic"
        return 1
    }

    log "link recovery finished for $nic"
}

# Resolve the interface to watch, then follow the kernel log and cycle the
# interface whenever a hang message for it appears.
# Globals:   WATCH_BRIDGE, COOLDOWN_SECONDS, HANG_PATTERN (read)
# Arguments: none are read
# Returns:   does not return normally; exits 1 when a required command is
#            missing, COOLDOWN_SECONDS is not a non-negative integer, the
#            interface cannot be determined, or the journal stream ends
main() {
    local iface=""
    local cooldown=0
    local last_recovery=0
    local now=0
    local line=""

    require_command journalctl
    require_command ifdown
    require_command ifup

    # COOLDOWN_SECONDS is used in an arithmetic context below, where
    # non-numeric text would be evaluated as an expression; insist on digits.
    if [[ ! "$COOLDOWN_SECONDS" =~ ^[0-9]+$ ]]; then
        log "COOLDOWN_SECONDS must be a non-negative integer, got '$COOLDOWN_SECONDS'"
        exit 1
    fi
    # Force base 10 so a value such as "08" is not rejected as invalid octal.
    cooldown=$(( 10#$COOLDOWN_SECONDS ))

    if ! iface="$(discover_watch_iface)"; then
        log "failed to determine uplink interface for bridge $WATCH_BRIDGE"
        exit 1
    fi

    log "watching journald for '$HANG_PATTERN' on interface $iface"

    while IFS= read -r line; do
        # Only react to the hang message reported for the watched interface.
        [[ "$line" == *"$iface: $HANG_PATTERN"* ]] || continue

        now="$(date +%s)"
        if (( now - last_recovery < cooldown )); then
            log "skipping duplicate event for $iface during cooldown (${COOLDOWN_SECONDS}s)"
            continue
        fi

        # Stamp before recovering so messages queued while the link is being
        # cycled fall inside the cooldown window.
        last_recovery="$now"
        recover_iface "$iface"
    done < <(journalctl --dmesg --follow --since now --output=cat)

    # With --follow the stream is not expected to end; if it does, the watcher
    # is no longer watching, so fail loudly instead of exiting with success.
    log "journalctl stream ended; no longer watching $iface"
    exit 1
}

# Start the watcher, forwarding any command-line arguments to main.
main "$@"