#!/bin/bash
#
# Watches the kernel log through journald for a NIC hang message on one
# interface and cycles that interface with ifdown/ifup when it appears.
#
# Environment (all optional):
#   WATCH_BRIDGE      Bridge whose configured port is watched when WATCH_IFACE
#                     is empty (default: vmbr443).
#   WATCH_IFACE       Interface to watch; when non-empty it is used as-is and
#                     no lookup in the network configuration takes place.
#   COOLDOWN_SECONDS  Number of seconds during which further hang events are
#                     skipped after a recovery (default: 30).
#   HANG_PATTERN      Text expected right after "<iface>: " in a kernel log
#                     line (default: "Detected Hardware Unit Hang:").

# Abort on use of an unset variable.
set -u

WATCH_BRIDGE="${WATCH_BRIDGE:-vmbr443}"
WATCH_IFACE="${WATCH_IFACE:-}"
COOLDOWN_SECONDS="${COOLDOWN_SECONDS:-30}"
HANG_PATTERN="${HANG_PATTERN:-Detected Hardware Unit Hang:}"

# Write a timestamped message to stderr.
# Arguments: the message words; they are joined with spaces.
# Outputs:   "<ISO-8601 timestamp> <message>" on stderr, nothing on stdout.
log() {
  printf '%s %s\n' "$(date -Is)" "$*" >&2
}

# Determine which interface should be watched.
#
# Globals:   WATCH_IFACE (read)  - returned verbatim when non-empty.
#            WATCH_BRIDGE (read) - bridge whose first configured port is used.
# Arguments: $1 - interfaces file (optional, default /etc/network/interfaces)
#            $2 - directory of additional interface files
#                 (optional, default /etc/network/interfaces.d)
# Outputs:   the interface name on stdout; anything from the first "." onward
#            is dropped from a discovered port name (eno1.443 -> eno1).
# Returns:   0 when an interface was found, 1 otherwise.
discover_watch_iface() {
  local interfaces_file="${1:-/etc/network/interfaces}"
  local interfaces_dir="${2:-/etc/network/interfaces.d}"
  local candidate=""
  local fragment=""
  local -a fragments=()
  # One awk program shared by both lookups:
  #  - state is reset at the start of every file, so an option line at the top
  #    of one file is never attributed to a stanza opened in the previous file;
  #  - both the "bridge-ports" and "bridge_ports" spellings are accepted;
  #  - the placeholder "none" is not reported as an interface.
  local find_port='
    FNR == 1 { in_bridge = 0 }
    $1 == "iface" { in_bridge = ($2 == bridge); next }
    in_bridge && ($1 == "bridge-ports" || $1 == "bridge_ports") {
      if ($2 != "" && $2 != "none") print $2
      exit
    }
  '

  # An explicit override wins over any configuration lookup.
  if [[ -n "$WATCH_IFACE" ]]; then
    printf '%s\n' "$WATCH_IFACE"
    return 0
  fi

  if [[ -r "$interfaces_file" ]]; then
    candidate="$(awk -v bridge="$WATCH_BRIDGE" "$find_port" "$interfaces_file")"
  fi

  if [[ -z "$candidate" && -d "$interfaces_dir" ]]; then
    # Only hand readable regular files to awk; an unmatched glob leaves the
    # literal pattern behind, which the -f test filters out.
    for fragment in "$interfaces_dir"/*; do
      if [[ -f "$fragment" && -r "$fragment" ]]; then
        fragments+=("$fragment")
      fi
    done
    if (( ${#fragments[@]} > 0 )); then
      candidate="$(awk -v bridge="$WATCH_BRIDGE" "$find_port" "${fragments[@]}")"
    fi
  fi

  if [[ -n "$candidate" ]]; then
    printf '%s\n' "${candidate%%.*}"
    return 0
  fi

  return 1
}

# Ensure a command is available, terminating the script otherwise.
# Arguments: $1 - command name to look up with `command -v`.
# Outputs:   a "missing required command" message via log when not found.
# Returns:   0 when the command exists; exits the shell with status 1 if not.
require_command() {
  local cmd="$1"

  if ! command -v "$cmd" >/dev/null 2>&1; then
    log "missing required command: $cmd"
    exit 1
  fi
}

# Cycle an interface with ifdown/ifup.
# Arguments: $1 - interface name.
# Outputs:   progress and failure messages via log.
# Returns:   0 when ifup succeeded, 1 when ifup failed.
recover_iface() {
  local iface="$1"

  log "hardware hang detected on $iface; cycling link with ifdown/ifup"
  # A failing ifdown is only reported; ifup is attempted regardless.
  ifdown --force "$iface" || log "ifdown reported a non-zero exit code for $iface"
  sleep 2
  if ! ifup "$iface"; then
    log "ifup failed for $iface"
    return 1
  fi
  log "link recovery finished for $iface"
  return 0
}

# Entry point: follow the kernel log and recover the watched interface
# whenever a line containing "<iface>: <HANG_PATTERN>" shows up.
#
# Globals:   WATCH_BRIDGE, HANG_PATTERN, COOLDOWN_SECONDS (read)
# Outputs:   status messages via log.
# Returns:   1 when the journalctl stream ends; exits with status 1 when a
#            required command is missing, COOLDOWN_SECONDS is not a
#            non-negative integer, or no interface could be determined.
main() {
  local iface=""
  local line=""
  local cooldown=0
  local last_recovery=0
  local now=0

  require_command journalctl
  require_command ifdown
  require_command ifup

  # The value ends up in an arithmetic context, where arbitrary text would be
  # evaluated as an expression; accept plain digits only.
  if [[ ! "$COOLDOWN_SECONDS" =~ ^[0-9]+$ ]]; then
    log "COOLDOWN_SECONDS must be a non-negative integer, got '$COOLDOWN_SECONDS'"
    exit 1
  fi
  # Force base 10 so a value such as "08" is not parsed as invalid octal.
  cooldown=$((10#$COOLDOWN_SECONDS))

  if ! iface="$(discover_watch_iface)"; then
    log "failed to determine uplink interface for bridge $WATCH_BRIDGE"
    exit 1
  fi

  log "watching journald for '$HANG_PATTERN' on interface $iface"

  while IFS= read -r line; do
    # Quoted right-hand side: literal substring match, no glob interpretation.
    [[ "$line" == *"$iface: $HANG_PATTERN"* ]] || continue

    now="$(date +%s)"
    if (( now - last_recovery < cooldown )); then
      log "skipping duplicate event for $iface during cooldown (${COOLDOWN_SECONDS}s)"
      continue
    fi

    # Detach stdin: the loop's stdin is the journalctl stream, and anything
    # started by the recovery that reads stdin would swallow pending events.
    recover_iface "$iface" </dev/null

    # Start the cooldown once the recovery is over, so hang lines that were
    # queued while the link was being cycled are treated as duplicates.
    last_recovery="$(date +%s)"
  done < <(journalctl --dmesg --follow --since now --output=cat)

  # --follow is not expected to finish; report it instead of ending silently
  # with whatever status the last loop command happened to leave behind.
  log "journalctl stream ended unexpectedly"
  return 1
}

# Run the watcher, forwarding the script's command-line arguments.
main "$@"