#!/usr/bin/env bash
# Thunderbolt bridge recovery helper.
#
# Keeps the Thunderbolt bridge present, enslaves every thunderbolt* netdev to
# it, escalates through rescans when no such netdev exists, and probes the
# configured peer of each interface, cycling the link when the peer stays
# unreachable. Cooldown state is kept under STATE_DIR, so the script is
# presumably meant to be run repeatedly (e.g. from a timer) - TODO confirm.
#
# NOTE(review): this file was recovered from a web blame view; the export
# attributed the change to "Bogdan Timofte".
#
# Environment overrides:
#   TB_PEER_FAIL_THRESHOLD       consecutive failed probes before a link cycle
#   TB_IFACE_CYCLE_COOLDOWN_SEC  minimum seconds between cycles per interface
#   TB_IFACE_CYCLE_SETTLE_SEC    seconds to wait after a cycle
#   TB_PING_TIMEOUT_SEC          per-probe ping timeout (ping -W)
set -euo pipefail

# Bridge the thunderbolt netdevs are attached to, and the MTU applied to the
# bridge and to every port.
BRIDGE="thunderbridge"
MTU="65520"

# Set to 1 once at least one thunderbolt* netdev has been seen.
FOUND_TB_IFACE=0

# State files (timestamps and failure counters) live here.
STATE_DIR="/run/tb-recover"
LAST_BOLT_RESTART_FILE="${STATE_DIR}/last_bolt_restart_epoch"
BOLT_RESTART_COOLDOWN_SEC=600
LAST_NHI_RESCAN_FILE="${STATE_DIR}/last_nhi_rescan_epoch"
NHI_RESCAN_COOLDOWN_SEC=600
# Seconds to wait after a PCI remove+rescan before re-triggering udev.
NHI_SETTLE_SEC=8

PEER_FAIL_THRESHOLD="${TB_PEER_FAIL_THRESHOLD:-2}"
IFACE_CYCLE_COOLDOWN_SEC="${TB_IFACE_CYCLE_COOLDOWN_SEC:-300}"
IFACE_CYCLE_SETTLE_SEC="${TB_IFACE_CYCLE_SETTLE_SEC:-5}"
PING_TIMEOUT_SEC="${TB_PING_TIMEOUT_SEC:-1}"

# Short host name; falls back to plain `hostname` when `-s` is not supported.
LOCAL_HOST="$(hostname -s 2>/dev/null || hostname)"

mkdir -p "$STATE_DIR"

# Print a timestamped message to stdout.
# Arguments: all arguments form the message ("$*" joins them with the first
#            character of IFS, a space by default).
# Outputs:   "<date -Is timestamp> <message>" followed by a newline.
log() {
  printf '%s %s\n' "$(date -Is)" "$*"
}

# Report whether a command name resolves in the current shell.
# Arguments: $1 - command name to look up.
# Returns:   0 when `command -v` finds it, non-zero otherwise. Prints nothing.
command_exists() {
  command -v "$1" >/dev/null 2>&1
}

# Print the path of the failed-probe counter file for an interface.
# Globals:   STATE_DIR (read)
# Arguments: $1 - interface name.
# Outputs:   "<STATE_DIR>/peer-fail-<iface>.count" on stdout.
counter_file_for_iface() {
  printf '%s/peer-fail-%s.count\n' "$STATE_DIR" "$1"
}

# Print the path of the file holding the epoch of the last link cycle of an
# interface.
# Globals:   STATE_DIR (read)
# Arguments: $1 - interface name.
# Outputs:   "<STATE_DIR>/last-iface-cycle-<iface>.epoch" on stdout.
cooldown_file_for_iface() {
  printf '%s/last-iface-cycle-%s.epoch\n' "$STATE_DIR" "$1"
}

# Read a non-negative integer from a state file.
# Arguments: $1 - path of the file to read.
# Outputs:   the stored number on stdout, or 0 when the file is missing,
#            unreadable, empty, or contains anything but decimal digits.
# Returns:   0
read_epoch_file() {
  local file="$1"
  local value="0"

  if [ -f "$file" ]; then
    value="$(cat "$file" 2>/dev/null || echo 0)"
  fi

  case "$value" in
    '' | *[!0-9]*)
      # Empty or not purely digits (this also rejects multi-line content).
      value=0
      ;;
    *)
      # Force base 10: callers feed the result to $(( )), where a leading
      # zero such as "08" would otherwise be rejected as invalid octal.
      value=$((10#$value))
      ;;
  esac

  printf '%s\n' "$value"
}

# Read a failure counter from a state file.
# Counters are parsed exactly like epoch files, so this delegates to
# read_epoch_file.
# Arguments: $1 - path of the counter file.
# Outputs:   whatever read_epoch_file prints for that path.
read_counter_file() {
  read_epoch_file "$1"
}

# Look up the peer address to probe for an interface on this host.
# The mapping is a fixed table keyed by "<LOCAL_HOST>:<iface>".
# Globals:   LOCAL_HOST (read)
# Arguments: $1 - interface name.
# Outputs:   the peer IPv4 address on stdout when the pair is known.
# Returns:   0 when a peer is configured, 1 otherwise (nothing is printed).
peer_ip_for_iface() {
  local iface="$1"

  case "${LOCAL_HOST}:${iface}" in
    baobab:thunderbolt0)
      printf '%s\n' "192.168.10.92"
      ;;
    baobab:thunderbolt1)
      printf '%s\n' "192.168.10.93"
      ;;
    ebony:thunderbolt0 | tapia:thunderbolt0)
      # Both hosts probe the same address.
      printf '%s\n' "192.168.10.91"
      ;;
    *)
      return 1
      ;;
  esac
}

# Check whether an interface is a bridge port in the forwarding state.
# Arguments: $1 - interface name.
# Returns:   0 when /sys/class/net/<iface>/brport/state is readable and
#            contains "3"; non-zero otherwise (including when the interface
#            is not a bridge port and the file does not exist).
iface_is_forwarding() {
  local iface="$1"
  local state_file="/sys/class/net/${iface}/brport/state"

  [ -r "$state_file" ] || return 1
  # "3" is the value treated as forwarding here (BR_STATE_FORWARDING in the
  # Linux bridge code).
  [ "$(cat "$state_file" 2>/dev/null || echo 0)" = "3" ]
}

# Check whether the kernel reports an interface as operationally up.
# Arguments: $1 - interface name.
# Returns:   0 when /sys/class/net/<iface>/operstate is readable and contains
#            "up"; non-zero otherwise.
iface_is_oper_up() {
  local iface="$1"
  local operstate_file="/sys/class/net/${iface}/operstate"

  [ -r "$operstate_file" ] || return 1
  [ "$(cat "$operstate_file" 2>/dev/null || true)" = "up" ]
}

# Send a single ping to a peer through the bridge.
# Globals:   BRIDGE, PING_TIMEOUT_SEC (read)
# Arguments: $1 - peer IP address.
# Returns:   the exit status of ping (0 when the peer answered).
probe_peer_ip() {
  local peer_ip="$1"

  # Drop any cached neighbour entry first; presumably so the probe has to
  # resolve the peer again instead of relying on a stale entry. Failure to
  # delete (e.g. no entry present) is ignored on purpose.
  ip neigh del "$peer_ip" dev "$BRIDGE" 2>/dev/null || true
  ping -I "$BRIDGE" -n -c 1 -W "$PING_TIMEOUT_SEC" "$peer_ip" >/dev/null 2>&1
}

# Bounce an interface whose peer stopped answering, then re-attach it to the
# bridge.
# Globals:   IFACE_CYCLE_COOLDOWN_SEC, IFACE_CYCLE_SETTLE_SEC, MTU, BRIDGE
#            (all read)
# Arguments: $1 - interface name.
#            $2 - peer IP address (used in log messages only).
# Returns:   0 when the cycle ran or was skipped because of the cooldown;
#            1 when ifup failed (cooldown stamp and counter are then left
#            untouched).
recover_iface_cycle() {
  local iface="$1"
  local peer_ip="$2"
  local now
  local last_cycle
  local cooldown_file

  now="$(date +%s)"
  cooldown_file="$(cooldown_file_for_iface "$iface")"
  last_cycle="$(read_epoch_file "$cooldown_file")"
  if [ $((now - last_cycle)) -lt "$IFACE_CYCLE_COOLDOWN_SEC" ]; then
    log "peer ${peer_ip} still unhealthy on ${iface}, but iface cycle is cooling down"
    return 0
  fi

  log "peer ${peer_ip} unhealthy on ${iface}; cycling link with ifdown/ifup"
  if command_exists ifdown && command_exists ifup; then
    # A failing ifdown is only logged; ifup is attempted regardless.
    ifdown --force "$iface" || log "ifdown reported a non-zero exit code for ${iface}"
    sleep 2
    if ! ifup "$iface"; then
      log "ifup failed for ${iface}"
      return 1
    fi
  else
    log "ifdown/ifup unavailable; falling back to ip link bounce for ${iface}"
    ip link set "$iface" down || true
    sleep 2
    ip link set "$iface" up || true
  fi

  # Re-apply MTU and bridge membership (best effort), then start the
  # per-interface enlist unit.
  ip link set "$iface" mtu "$MTU" || true
  ip link set "$iface" master "$BRIDGE" || true
  systemctl start "tb-enlist@${iface}.service" || true

  # Stamp the cooldown with the time taken at entry and reset the failure
  # counter.
  printf '%s\n' "$now" > "$cooldown_file"
  rm -f "$(counter_file_for_iface "$iface")"
  sleep "$IFACE_CYCLE_SETTLE_SEC"
}

# Probe the configured peer of an interface and trigger a link cycle after
# enough consecutive failures.
# Globals:   PEER_FAIL_THRESHOLD (read)
# Arguments: $1 - interface name.
# Returns:   0 in every case except when the threshold is reached, where the
#            status of recover_iface_cycle is returned.
assess_peer_health() {
  local iface="$1"
  local peer_ip=""
  local counter_file=""
  local fail_count=0

  # Interfaces without a configured peer are not monitored.
  if ! peer_ip="$(peer_ip_for_iface "$iface")"; then
    return 0
  fi

  counter_file="$(counter_file_for_iface "$iface")"

  # A link that is down or not forwarding is not probed; the failure streak
  # is reset instead of being counted.
  if ! iface_is_oper_up "$iface" || ! iface_is_forwarding "$iface"; then
    rm -f "$counter_file"
    return 0
  fi

  # A successful probe ends any failure streak.
  if probe_peer_ip "$peer_ip"; then
    rm -f "$counter_file"
    return 0
  fi

  fail_count="$(read_counter_file "$counter_file")"
  fail_count=$((fail_count + 1))
  printf '%s\n' "$fail_count" > "$counter_file"
  log "peer probe failed on ${iface} towards ${peer_ip} (${fail_count}/${PEER_FAIL_THRESHOLD})"

  if [ "$fail_count" -lt "$PEER_FAIL_THRESHOLD" ]; then
    return 0
  fi

  recover_iface_cycle "$iface" "$peer_ip"
}

# Report whether at least one thunderbolt* network device exists.
# Arguments: $1 - (optional) directory to look in; defaults to
#                 /sys/class/net.
# Returns:   0 when an entry named thunderbolt* exists there, 1 otherwise.
has_tb_netdev() {
  local net_dir="${1:-/sys/class/net}"
  local path=""

  # An unmatched glob stays literal, hence the explicit existence test.
  for path in "$net_dir"/thunderbolt*; do
    if [ -e "$path" ]; then
      return 0
    fi
  done

  return 1
}

# Detect a Thunderbolt device node that has no "<node>.*" child entry.
# Candidates are entries named [0-9]-[1-9]* whose name contains neither "."
# nor ":"; such a node is considered stale when no sibling "<node>.<x>"
# exists next to it.
# Arguments: $1 - (optional) devices directory; defaults to
#                 /sys/bus/thunderbolt/devices.
# Returns:   0 when at least one stale node is found, 1 otherwise.
has_stale_tb_xdomain() {
  local devices_dir="${1:-/sys/bus/thunderbolt/devices}"
  local dev=""
  local child=""
  local has_child=0

  for dev in "$devices_dir"/[0-9]-[1-9]*; do
    [ -e "$dev" ] || continue
    case "${dev##*/}" in
      *.* | *:*)
        # Not a bare "<a>-<b>" node; skip it.
        continue
        ;;
    esac

    has_child=0
    for child in "${dev}".*; do
      # An unmatched glob stays literal and fails this existence test.
      if [ -e "$child" ] || [ -L "$child" ]; then
        has_child=1
        break
      fi
    done

    if [ "$has_child" -eq 0 ]; then
      return 0
    fi
  done

  return 1
}

# Ask the Thunderbolt domains to rescan and replay udev events.
# Writes 1 to every /sys/bus/thunderbolt/devices/domain*/rescan that exists,
# then triggers "change" events for the thunderbolt subsystem and "add"
# events for the net subsystem. Every step is best effort.
# Returns:   0
trigger_tb_rescan() {
  local domain=""

  for domain in /sys/bus/thunderbolt/devices/domain*; do
    if [ -e "$domain/rescan" ]; then
      echo 1 > "$domain/rescan" || true
    fi
  done

  udevadm trigger --subsystem-match=thunderbolt --action=change || true
  udevadm trigger --subsystem-match=net --action=add || true
}

# Remove the Thunderbolt NHI PCI device and rescan the PCI bus.
# The NHI is the first PCI device with class 0x088000 that is bound to the
# "thunderbolt" driver and exposes a writable "remove" attribute.
# Globals:   LAST_NHI_RESCAN_FILE (read; the file it names is written)
# Arguments: $1 - epoch to record as the time of this rescan.
# Returns:   0 when an NHI was found (remove/rescan writes are best effort),
#            1 when no matching device exists.
run_nhi_rescan() {
  local epoch="$1"
  local dev=""
  local cls=""
  local drv=""
  local nhi_pci=""

  for dev in /sys/bus/pci/devices/*; do
    [ -e "$dev/class" ] || continue
    [ -e "$dev/driver" ] || continue
    [ -w "$dev/remove" ] || continue
    cls="$(cat "$dev/class" 2>/dev/null || true)"
    # The driver entry is a symlink; its target's basename is the driver name.
    drv="$(basename "$(readlink -f "$dev/driver" 2>/dev/null || true)")"
    if [ "$cls" = "0x088000" ] && [ "$drv" = "thunderbolt" ]; then
      nhi_pci="$dev"
      break
    fi
  done

  if [ -n "$nhi_pci" ]; then
    echo 1 > "$nhi_pci/remove" || true
    sleep 1
    echo 1 > /sys/bus/pci/rescan || true
    printf '%s\n' "$epoch" > "$LAST_NHI_RESCAN_FILE"
    return 0
  fi

  return 1
}

# Keep the bridge present and up before trying to enslave ports.
# Every step is best effort so one failing command does not abort the run.
if ! ip link show "$BRIDGE" >/dev/null 2>&1; then
  ip link add name "$BRIDGE" type bridge || true
fi
ip link set "$BRIDGE" mtu "$MTU" || true
ip link set "$BRIDGE" up || true

# Bring up every thunderbolt netdev, attach it to the bridge and start its
# per-interface enlist unit. FOUND_TB_IFACE records that at least one exists.
for path in /sys/class/net/thunderbolt*; do
  # An unmatched glob stays literal; skip it.
  [ -e "$path" ] || continue
  IFACE="${path##*/}"
  FOUND_TB_IFACE=1
  ip link set "$IFACE" up || true
  ip link set "$IFACE" mtu "$MTU" || true
  ip link set "$IFACE" master "$BRIDGE" || true
  systemctl start "tb-enlist@${IFACE}.service" || true
done

# If no thunderbolt netdev exists but a TB domain exists, force a rescan + udev retrigger.
if [ "$FOUND_TB_IFACE" -eq 0 ] && [ -d /sys/bus/thunderbolt/devices ]; then
  trigger_tb_rescan

  # Escalate with cooldown: try PCI NHI remove+rescan to emulate a soft replug.
  sleep 2
  if ! has_tb_netdev; then
    now="$(date +%s)"
    # read_epoch_file yields 0 for a missing or malformed stamp, which makes
    # the corresponding cooldown count as expired.
    last="$(read_epoch_file "$LAST_BOLT_RESTART_FILE")"
    nhi_last="$(read_epoch_file "$LAST_NHI_RESCAN_FILE")"

    if [ $((now - nhi_last)) -ge "$NHI_RESCAN_COOLDOWN_SEC" ]; then
      if run_nhi_rescan "$now"; then
        sleep "$NHI_SETTLE_SEC"
        trigger_tb_rescan

        # On newer kernels the first NHI reset can stop at the peer xdomain host
        # node without recreating the matching *.0 network service.
        # In that case one immediate second reset is attempted; it is not
        # subject to the cooldown checked above.
        if ! has_tb_netdev && has_stale_tb_xdomain; then
          retry_now="$(date +%s)"
          if run_nhi_rescan "$retry_now"; then
            sleep "$NHI_SETTLE_SEC"
            trigger_tb_rescan
          fi
        fi
      fi
    fi

    # Secondary fallback with cooldown: restart boltd if interface is still missing
    # and the host actually uses that service.
    if ! has_tb_netdev; then
      if [ $((now - last)) -ge "$BOLT_RESTART_COOLDOWN_SEC" ]; then
        if systemctl list-unit-files bolt.service >/dev/null 2>&1; then
          systemctl restart bolt.service || true
          printf '%s\n' "$now" > "$LAST_BOLT_RESTART_FILE"
        fi
      fi
    fi

    # Final retrigger after whatever escalation ran above.
    trigger_tb_rescan
  fi
fi

# Probe the peer behind every thunderbolt netdev that exists at this point.
# Under `set -e` a non-zero status from assess_peer_health (a failed ifup
# during a link cycle) ends the script here.
for path in /sys/class/net/thunderbolt*; do
  # An unmatched glob stays literal; skip it.
  [ -e "$path" ] || continue
  assess_peer_health "${path##*/}"
done