Newer Older
f16725e 3 months ago History
102 lines | 2.642kb
Bogdan Timofte authored 3 months ago
1
#!/bin/bash

# Expanding an unset variable is a fatal error for the rest of the script.
set -u

# Tunables. Each one keeps a non-empty value inherited from the environment
# and otherwise falls back to the default given here.
#   WATCH_BRIDGE      bridge whose uplink port is looked up in the ifupdown config
#   WATCH_IFACE       explicit interface to watch; empty means "discover it"
#   COOLDOWN_SECONDS  minimum spacing between two recoveries
#   HANG_PATTERN      kernel log text that identifies a hang event
: "${WATCH_BRIDGE:=vmbr443}"
: "${WATCH_IFACE:=}"
: "${COOLDOWN_SECONDS:=30}"
: "${HANG_PATTERN:=Detected Hardware Unit Hang:}"
9

            
10
# Emit one timestamped diagnostic line on stderr.
# Arguments: message words, combined into a single string via "$*".
# Outputs:   "<date -Is timestamp> <message>" on stderr; nothing on stdout.
log() {
    local stamp
    stamp="$(date -Is)"
    printf '%s %s\n' "$stamp" "$*" >&2
}
13

            
14
# Print the first real port listed for bridge $1 in the given ifupdown-style
# files ($2...). Prints nothing when the bridge has no stanza, no port line,
# or only the placeholder "none".
_bridge_port_from_files() {
    local bridge="$1"
    shift

    # With no file operands awk would block reading stdin.
    (( $# > 0 )) || return 0

    # stderr is silenced on purpose: an unmatched glob reaches us as a literal
    # pattern, and unreadable files should simply yield "not found".
    awk -v bridge="$bridge" '
        # A stanza never continues into the next file.
        FNR == 1 { in_bridge = 0 }
        $1 == "iface" { in_bridge = ($2 == bridge); next }
        # Accept both the dash and the underscore spelling of the option.
        in_bridge && ($1 == "bridge-ports" || $1 == "bridge_ports") {
            for (i = 2; i <= NF; i++) {
                # "none" marks a bridge without physical ports.
                if ($i != "none") { print $i; exit }
            }
        }
    ' "$@" 2>/dev/null
}

# Determine which interface should be watched.
# Globals:   WATCH_IFACE (read)  - if non-empty, returned as-is
#            WATCH_BRIDGE (read) - bridge whose first port is looked up
#            WATCH_INTERFACES_FILE, WATCH_INTERFACES_DIR (read, optional) -
#              override the config locations; default to
#              /etc/network/interfaces and /etc/network/interfaces.d
# Outputs:   the interface name on stdout
# Returns:   0 when an interface was determined, 1 otherwise
discover_watch_iface() {
    local candidate=""
    local interfaces_file="${WATCH_INTERFACES_FILE:-/etc/network/interfaces}"
    local interfaces_dir="${WATCH_INTERFACES_DIR:-/etc/network/interfaces.d}"

    # An explicit override always wins.
    if [[ -n "$WATCH_IFACE" ]]; then
        printf '%s\n' "$WATCH_IFACE"
        return 0
    fi

    if [[ -r "$interfaces_file" ]]; then
        candidate="$(_bridge_port_from_files "$WATCH_BRIDGE" "$interfaces_file")"
    fi

    # Only consult the drop-in directory when the main file gave no answer.
    if [[ -z "$candidate" && -d "$interfaces_dir" ]]; then
        candidate="$(_bridge_port_from_files "$WATCH_BRIDGE" "$interfaces_dir"/*)"
    fi

    [[ -n "$candidate" ]] || return 1

    # Drop everything from the first dot on, so a port such as "eno1.443"
    # is reported as "eno1".
    printf '%s\n' "${candidate%%.*}"
}
49

            
50
# Terminate the script unless the named command can be resolved.
# Arguments: $1 - command name to look up with `command -v`
# Outputs:   a "missing required command" message via log when not found
# Returns:   0 when the command exists; otherwise exits the shell with status 1
require_command() {
    local tool="$1"

    command -v "$tool" >/dev/null 2>&1 && return 0

    log "missing required command: $tool"
    exit 1
}
57

            
58
# Bounce an interface: forced ifdown, a two-second pause, then ifup.
# Arguments: $1 - interface name
# Outputs:   progress and failure messages via log
# Returns:   1 when ifup fails; a failing ifdown is only logged
recover_iface() {
    local nic="$1"

    log "hardware hang detected on $nic; cycling link with ifdown/ifup"

    # A failed ifdown is reported but does not stop the recovery attempt.
    if ! ifdown --force "$nic"; then
        log "ifdown reported a non-zero exit code for $nic"
    fi

    sleep 2

    if ifup "$nic"; then
        log "link recovery finished for $nic"
        return
    fi

    log "ifup failed for $nic"
    return 1
}
70

            
71
# Entry point: follow the kernel log and bounce the watched interface whenever
# a hang message for it appears, at most once per cooldown window.
# Globals:   WATCH_BRIDGE, HANG_PATTERN, COOLDOWN_SECONDS (read)
# Outputs:   diagnostics via log
# Returns:   1 when the journalctl stream ends; exits 1 on missing commands,
#            an invalid COOLDOWN_SECONDS, or when no interface can be found
main() {
    local iface=""
    local last_recovery=0
    local now=0
    local line=""
    local cooldown=0

    require_command journalctl
    require_command ifdown
    require_command ifup

    # COOLDOWN_SECONDS comes from the environment and ends up in arithmetic;
    # anything but plain digits would raise an error on every event and
    # silently disable the cooldown.
    if [[ ! "$COOLDOWN_SECONDS" =~ ^[0-9]+$ ]]; then
        log "COOLDOWN_SECONDS must be a non-negative integer, got '$COOLDOWN_SECONDS'"
        exit 1
    fi
    # Force base 10 so values such as "08" are not rejected as bad octal.
    cooldown=$(( 10#$COOLDOWN_SECONDS ))

    if ! iface="$(discover_watch_iface)"; then
        log "failed to determine uplink interface for bridge $WATCH_BRIDGE"
        exit 1
    fi

    log "watching journald for '$HANG_PATTERN' on interface $iface"

    while IFS= read -r line; do
        [[ "$line" == *"$iface: $HANG_PATTERN"* ]] || continue

        now="$(date +%s)"
        if (( now - last_recovery < cooldown )); then
            log "skipping duplicate event for $iface during cooldown (${COOLDOWN_SECONDS}s)"
            continue
        fi

        # Inside this loop stdin is the journal stream; detach it so that
        # ifdown/ifup (or anything they run) cannot swallow pending lines.
        recover_iface "$iface" </dev/null

        # Start the cooldown when the recovery ends rather than when it began:
        # hang lines that queued up while the link was being cycled describe
        # the same incident and must not trigger another cycle.
        last_recovery="$(date +%s)"
    done < <(journalctl --dmesg --follow --since now --output=cat)

    # --follow is not expected to finish; report the end of the stream as a
    # failure instead of exiting 0 silently.
    log "journalctl stream ended unexpectedly"
    return 1
}
101

            
102
# Run the watcher, forwarding the script's command-line arguments to main.
main "$@"