Newer Older
325 lines | 8.395kb
Bogdan Timofte authored 3 months ago
1
#!/usr/bin/env bash
#
# tb-recover: periodic monitor that enlists thunderbolt* network interfaces
# into a bridge and attempts recovery when the interfaces are missing or a
# peer stops answering pings.
#
# Optional environment overrides (default in parentheses):
#   TB_PEER_FAIL_THRESHOLD        failed probes before a link is cycled (2)
#   TB_IFACE_CYCLE_COOLDOWN_SEC   minimum seconds between cycles of a link (300)
#   TB_IFACE_CYCLE_SETTLE_SEC     pause after cycling a link (5)
#   TB_PING_TIMEOUT_SEC           ping reply timeout (1)
#   TB_MONITOR_INTERVAL_SEC       pause between monitor passes (60)
set -euo pipefail

# Bridge device and the MTU applied to it and to every enlisted interface.
BRIDGE='thunderbridge'
MTU='65520'

# Directory holding failure counters and cooldown timestamps.
STATE_DIR='/run/tb-recover'
LAST_BOLT_RESTART_FILE="${STATE_DIR}/last_bolt_restart_epoch"
LAST_NHI_RESCAN_FILE="${STATE_DIR}/last_nhi_rescan_epoch"

# Cooldowns (seconds) for bolt.service restarts and NHI PCI rescans, plus the
# pause taken after an NHI rescan.
BOLT_RESTART_COOLDOWN_SEC=600
NHI_RESCAN_COOLDOWN_SEC=600
NHI_SETTLE_SEC=8

# Tunables that can be overridden from the environment.
PEER_FAIL_THRESHOLD="${TB_PEER_FAIL_THRESHOLD:-2}"
IFACE_CYCLE_COOLDOWN_SEC="${TB_IFACE_CYCLE_COOLDOWN_SEC:-300}"
IFACE_CYCLE_SETTLE_SEC="${TB_IFACE_CYCLE_SETTLE_SEC:-5}"
PING_TIMEOUT_SEC="${TB_PING_TIMEOUT_SEC:-1}"
MONITOR_INTERVAL_SEC="${TB_MONITOR_INTERVAL_SEC:-60}"

# Short hostname, falling back to plain `hostname` if `-s` is unsupported.
# Used by the static peer table.
LOCAL_HOST="$(hostname -s 2>/dev/null || hostname)"

mkdir -p "$STATE_DIR"

# The trap body is only evaluated when a signal arrives, so it may refer to
# log() even though that function is defined further down.
trap 'log "Shutting down tb-recover"; exit 0' SIGTERM SIGINT
Bogdan Timofte authored 3 months ago
21

            
22
# Write a timestamped message to stdout.
# Arguments: the message words; they are joined into a single string.
# Outputs:   "<ISO-8601 timestamp> <message>" followed by a newline.
log() {
  local message="$*"
  printf '%s %s\n' "$(date -Is)" "${message}"
}
25

            
26
# Report whether a command name can be resolved by the shell.
# Arguments: $1 - command name to look up
# Returns:   0 when `command -v` finds it, 1 otherwise. Prints nothing.
command_exists() {
  local name="$1"
  if command -v "$name" >/dev/null 2>&1; then
    return 0
  fi
  return 1
}
29

            
30
# Print the path of the failed-probe counter file for an interface.
# Globals:   STATE_DIR (read)
# Arguments: $1 - interface name
counter_file_for_iface() {
  local iface="$1"
  printf '%s\n' "${STATE_DIR}/peer-fail-${iface}.count"
}
33

            
34
# Print the path of the file recording when an interface was last cycled.
# Globals:   STATE_DIR (read)
# Arguments: $1 - interface name
cooldown_file_for_iface() {
  local iface="$1"
  printf '%s\n' "${STATE_DIR}/last-iface-cycle-${iface}.epoch"
}
37

            
38
# Read a non-negative integer from a state file.
# Arguments: $1 - path of the file to read
# Outputs:   the stored number in plain base-10 form, or 0 when the file is
#            missing, unreadable, empty, or holds anything but digits.
# Returns:   0
read_epoch_file() {
  local file="$1"
  local value="0"

  if [ -f "$file" ]; then
    value="$(cat "$file" 2>/dev/null || echo 0)"
  fi

  case "$value" in
    '' | *[!0-9]*)
      value=0
      ;;
    *)
      # Callers feed the result into $(( )) arithmetic, where a leading zero
      # selects octal and makes e.g. "08" a hard error. Normalising with an
      # explicit base-10 prefix strips the leading zeros.
      value=$((10#$value))
      ;;
  esac

  printf '%s\n' "$value"
}
54

            
55
# Read a failure counter; counters use the same on-disk format as epoch
# files, so this simply delegates to read_epoch_file.
# Arguments: $1 - path of the counter file
read_counter_file() {
  local counter_path="$1"
  read_epoch_file "$counter_path"
}
58

            
59
# Print the IP address of the peer expected behind a Thunderbolt interface.
# Globals:   LOCAL_HOST (read)
# Arguments: $1 - interface name
# Outputs:   the peer IP on stdout
# Returns:   0 when a peer is known, 1 when neither lookup matches
peer_ip_for_iface() {
  local iface="$1"
  local sysfs_dev peer_host
  local resolved=""

  # Dynamic lookup: /sys/class/net/<iface>/device links to the XDomain
  # service path (e.g. .../1-1.0). Its parent directory is the XDomain device,
  # whose device_name attribute holds the peer hostname.
  sysfs_dev="$(readlink -f "/sys/class/net/${iface}/device" 2>/dev/null || true)"
  if [ -n "$sysfs_dev" ] && [ -d "$sysfs_dev" ]; then
    peer_host="$(cat "$(dirname "$sysfs_dev")/device_name" 2>/dev/null || true)"
    if [ "$peer_host" = "baobab" ]; then
      resolved="192.168.10.91"
    elif [ "$peer_host" = "ebony" ]; then
      resolved="192.168.10.92"
    elif [ "$peer_host" = "tapia" ]; then
      resolved="192.168.10.93"
    fi
    if [ -n "$resolved" ]; then
      printf '%s\n' "$resolved"
      return 0
    fi
  fi

  # Static fallback keyed on "<local host>:<iface>", used when the sysfs
  # lookup is unavailable or names an unknown peer.
  case "${LOCAL_HOST}:${iface}" in
    baobab:thunderbolt0) resolved="192.168.10.92" ;;
    baobab:thunderbolt1) resolved="192.168.10.93" ;;
    ebony:thunderbolt0 | tapia:thunderbolt0) resolved="192.168.10.91" ;;
    *) return 1 ;;
  esac

  printf '%s\n' "$resolved"
}
97

            
98
# Check whether an interface's bridge port reports state "3", the value this
# script treats as forwarding.
# Arguments: $1 - interface name
# Returns:   0 when brport/state reads "3"; non-zero when the file is not
#            readable or holds another value.
iface_is_forwarding() {
  local iface="$1"
  local port_state_path="/sys/class/net/${iface}/brport/state"
  local port_state

  if [ ! -r "$port_state_path" ]; then
    return 1
  fi

  port_state="$(cat "$port_state_path" 2>/dev/null || echo 0)"
  [ "$port_state" = "3" ]
}
105

            
106
# Check whether an interface's operstate is "up".
# Arguments: $1 - interface name
# Returns:   0 when /sys/class/net/<iface>/operstate reads "up"; non-zero when
#            the file is not readable or holds another value.
iface_is_oper_up() {
  local iface="$1"
  local operstate_path="/sys/class/net/${iface}/operstate"
  local operstate

  if [ ! -r "$operstate_path" ]; then
    return 1
  fi

  operstate="$(cat "$operstate_path" 2>/dev/null || true)"
  [ "$operstate" = "up" ]
}
113

            
114
# Send a single ping to a peer through the bridge.
# Globals:   BRIDGE, PING_TIMEOUT_SEC (read)
# Arguments: $1 - peer IP address
# Returns:   the exit status of ping (0 when the peer replied)
probe_peer_ip() {
  local target="$1"
  local -a ping_args=(-I "$BRIDGE" -n -c 1 -W "$PING_TIMEOUT_SEC" "$target")

  # Best effort: drop any cached neighbour entry for the peer before probing
  # (presumably so the ping cannot succeed from stale neighbour state).
  ip neigh del "$target" dev "$BRIDGE" 2>/dev/null || true

  ping "${ping_args[@]}" >/dev/null 2>&1
}
120

            
121
# Bounce a Thunderbolt interface whose peer is unreachable, then re-enlist it
# in the bridge. Skipped while the per-interface cooldown is still running.
# Globals:   IFACE_CYCLE_COOLDOWN_SEC, IFACE_CYCLE_SETTLE_SEC, MTU, BRIDGE (read)
# Arguments: $1 - interface name
#            $2 - peer IP (used in log messages only)
# Returns:   0 when the cycle ran or was skipped by the cooldown,
#            1 when ifup failed (cooldown stamp and counter are left untouched)
recover_iface_cycle() {
  local iface="$1"
  local peer_ip="$2"
  local started_at stamp_file previous_cycle elapsed

  started_at="$(date +%s)"
  stamp_file="$(cooldown_file_for_iface "$iface")"
  previous_cycle="$(read_epoch_file "$stamp_file")"
  elapsed=$((started_at - previous_cycle))
  if [ "$elapsed" -lt "$IFACE_CYCLE_COOLDOWN_SEC" ]; then
    log "peer ${peer_ip} still unhealthy on ${iface}, but iface cycle is cooling down"
    return 0
  fi

  log "peer ${peer_ip} unhealthy on ${iface}; cycling link with ifdown/ifup"
  if ! command_exists ifdown || ! command_exists ifup; then
    # No ifupdown tooling: bounce the link directly, ignoring failures.
    log "ifdown/ifup unavailable; falling back to ip link bounce for ${iface}"
    ip link set "$iface" down || true
    sleep 2
    ip link set "$iface" up || true
  else
    ifdown --force "$iface" || log "ifdown reported a non-zero exit code for ${iface}"
    sleep 2
    if ! ifup "$iface"; then
      log "ifup failed for ${iface}"
      return 1
    fi
  fi

  # Re-apply link settings and bridge membership; each step is best effort.
  ip link set "$iface" mtu "$MTU" || true
  ip link set "$iface" master "$BRIDGE" || true
  systemctl start "tb-enlist@${iface}.service" || true

  # Record the time the cycle started and reset the failure counter, then
  # pause before returning.
  printf '%s\n' "$started_at" > "$stamp_file"
  rm -f "$(counter_file_for_iface "$iface")"
  sleep "$IFACE_CYCLE_SETTLE_SEC"
}
158

            
159
# Probe the peer behind one interface and cycle the link after enough
# consecutive failures.
# Globals:   PEER_FAIL_THRESHOLD (read)
# Arguments: $1 - interface name
# Returns:   0 in every case except when recover_iface_cycle is invoked, whose
#            status is then returned.
assess_peer_health() {
  local iface="$1"
  local peer_ip counter_file failures

  # Interfaces without a known peer are not monitored.
  peer_ip="$(peer_ip_for_iface "$iface")" || return 0

  counter_file="$(counter_file_for_iface "$iface")"

  # Only probe links that are operationally up and forwarding; otherwise the
  # failure counter is reset and the interface is skipped.
  if ! { iface_is_oper_up "$iface" && iface_is_forwarding "$iface"; }; then
    rm -f "$counter_file"
    return 0
  fi

  # A successful probe resets the failure counter.
  if probe_peer_ip "$peer_ip"; then
    rm -f "$counter_file"
    return 0
  fi

  failures="$(read_counter_file "$counter_file")"
  failures=$((failures + 1))
  printf '%s\n' "$failures" > "$counter_file"
  log "peer probe failed on ${iface} towards ${peer_ip} (${failures}/${PEER_FAIL_THRESHOLD})"

  if [ "$failures" -lt "$PEER_FAIL_THRESHOLD" ]; then
    return 0
  fi

  recover_iface_cycle "$iface" "$peer_ip"
}
192

            
193
# Report whether at least one thunderbolt* network interface exists.
# Returns: 0 when /sys/class/net contains a thunderbolt* entry, 1 otherwise.
has_tb_netdev() {
  local path

  # Test the glob directly instead of forking ls. When nothing matches, the
  # pattern stays literal and fails both checks. -L keeps a dangling symlink
  # counting as present, as ls would.
  for path in /sys/class/net/thunderbolt*; do
    if [ -e "$path" ] || [ -L "$path" ]; then
      return 0
    fi
  done

  return 1
}
196

            
197
# Look for a Thunderbolt bus device entry (named like "<digit>-<1-9>...",
# without '.' or ':') that has no sibling "<entry>.*" entries. The script
# treats such an entry as a stale XDomain.
# Returns: 0 when such an entry exists, 1 otherwise.
has_stale_tb_xdomain() {
  local entry name

  for entry in /sys/bus/thunderbolt/devices/[0-9]-[1-9]*; do
    [ -e "$entry" ] || continue

    # Names containing '.' or ':' are skipped; only the plain device entries
    # are inspected.
    name="${entry##*/}"
    if [[ "$name" == *.* || "$name" == *:* ]]; then
      continue
    fi

    # At least one "<entry>.*" sibling means this device is not stale.
    if ls "${entry}".* >/dev/null 2>&1; then
      continue
    fi

    return 0
  done

  return 1
}
214

            
215
# Ask every Thunderbolt domain to rescan, then replay udev events for the
# thunderbolt and net subsystems. Every step is best effort.
# Returns: 0
trigger_tb_rescan() {
  local domain_dir rescan_node

  for domain_dir in /sys/bus/thunderbolt/devices/domain*; do
    rescan_node="${domain_dir}/rescan"
    if [ -e "$rescan_node" ]; then
      echo 1 > "$rescan_node" || true
    fi
  done

  udevadm trigger --subsystem-match=thunderbolt --action=change || true
  udevadm trigger --subsystem-match=net --action=add || true
}
224

            
225
# Remove the first PCI device with class 0x088000 that is bound to the
# "thunderbolt" driver, then ask the PCI bus to rescan.
# Globals:   LAST_NHI_RESCAN_FILE (written)
# Arguments: $1 - epoch to record as the time of this rescan
# Returns:   0 when a matching device was found and the rescan attempted,
#            1 when no matching device exists.
run_nhi_rescan() {
  local epoch="$1"
  local candidate pci_class driver_name
  local nhi_device=""

  for candidate in /sys/bus/pci/devices/*; do
    # Needs a class file, a bound driver and a writable remove node.
    if [ ! -e "$candidate/class" ] \
      || [ ! -e "$candidate/driver" ] \
      || [ ! -w "$candidate/remove" ]; then
      continue
    fi

    pci_class="$(cat "$candidate/class" 2>/dev/null || true)"
    driver_name="$(basename "$(readlink -f "$candidate/driver" 2>/dev/null || true)")"
    if [ "$pci_class" = "0x088000" ] && [ "$driver_name" = "thunderbolt" ]; then
      nhi_device="$candidate"
      break
    fi
  done

  if [ -z "$nhi_device" ]; then
    return 1
  fi

  # Failed sysfs writes are tolerated; the attempt is recorded either way.
  echo 1 > "$nhi_device/remove" || true
  sleep 1
  echo 1 > /sys/bus/pci/rescan || true
  printf '%s\n' "$epoch" > "$LAST_NHI_RESCAN_FILE"
  return 0
}
254

            
Bogdan Timofte authored 2 weeks ago
255
# Make sure the bridge device exists, has the configured MTU and is up.
# Every step is best effort; failures are ignored.
# Globals: BRIDGE, MTU (read)
# Returns: 0
init_bridge() {
  if ! ip link show "$BRIDGE" >/dev/null 2>&1; then
    ip link add name "$BRIDGE" type bridge || true
  fi

  ip link set "$BRIDGE" mtu "$MTU" || true
  ip link set "$BRIDGE" up || true
}
Bogdan Timofte authored 3 months ago
260

            
Bogdan Timofte authored 2 weeks ago
261
# Enlist every present thunderbolt* interface in the bridge; when none is
# present, escalate through rescans to bring the interfaces back.
#
# Escalation when no interface exists and the Thunderbolt bus is present:
#   1. domain rescan + udev replay
#   2. NHI PCI remove/rescan (at most once per NHI_RESCAN_COOLDOWN_SEC),
#      repeated once if a stale XDomain entry is still visible
#   3. bolt.service restart (at most once per BOLT_RESTART_COOLDOWN_SEC)
#   4. a final domain rescan + udev replay
#
# Globals: BRIDGE, MTU, LAST_BOLT_RESTART_FILE, LAST_NHI_RESCAN_FILE,
#          NHI_RESCAN_COOLDOWN_SEC, NHI_SETTLE_SEC, BOLT_RESTART_COOLDOWN_SEC
# Returns: 0 on the normal paths; individual recovery steps are best effort.
handle_missing_interfaces() {
  # All working variables are local so nothing leaks into the caller's scope.
  local path iface
  local found_tb_iface=0
  local now last nhi_last retry_now

  for path in /sys/class/net/thunderbolt*; do
    [ -e "$path" ] || continue
    found_tb_iface=1
    iface="${path##*/}"
    ip link set "$iface" up || true
    ip link set "$iface" mtu "$MTU" || true
    ip link set "$iface" master "$BRIDGE" || true
    systemctl start "tb-enlist@${iface}.service" || true
  done

  # Nothing to recover when an interface exists or the bus is absent.
  if [ "$found_tb_iface" -ne 0 ] || [ ! -d /sys/bus/thunderbolt/devices ]; then
    return 0
  fi

  trigger_tb_rescan
  sleep 2

  if has_tb_netdev; then
    return 0
  fi

  # Both cooldowns are measured against the time taken here, before any of
  # the settle sleeps below.
  now="$(date +%s)"
  last="$(read_epoch_file "$LAST_BOLT_RESTART_FILE")"
  nhi_last="$(read_epoch_file "$LAST_NHI_RESCAN_FILE")"

  if [ $((now - nhi_last)) -ge "$NHI_RESCAN_COOLDOWN_SEC" ]; then
    if run_nhi_rescan "$now"; then
      sleep "$NHI_SETTLE_SEC"
      trigger_tb_rescan

      # One immediate retry when the device is still missing and a stale
      # XDomain entry is visible.
      if ! has_tb_netdev && has_stale_tb_xdomain; then
        retry_now="$(date +%s)"
        if run_nhi_rescan "$retry_now"; then
          sleep "$NHI_SETTLE_SEC"
          trigger_tb_rescan
        fi
      fi
    fi
  fi

  if ! has_tb_netdev && [ $((now - last)) -ge "$BOLT_RESTART_COOLDOWN_SEC" ]; then
    if systemctl list-unit-files bolt.service >/dev/null 2>&1; then
      systemctl restart bolt.service || true
      printf '%s\n' "$now" > "$LAST_BOLT_RESTART_FILE"
    fi
  fi

  trigger_tb_rescan
}
309

            
310
# Run the peer health check on every present thunderbolt* interface.
#
# A failing check (for example when recovery could not bring the link back
# up) is logged and does not stop the remaining interfaces from being
# checked. Without the explicit handling, `set -e` would terminate the whole
# monitor on the first failed recovery.
# Returns: 0
monitor_interfaces() {
  local path iface

  for path in /sys/class/net/thunderbolt*; do
    [ -e "$path" ] || continue
    iface="${path##*/}"
    if ! assess_peer_health "$iface"; then
      log "health check or recovery failed for ${iface}; retrying on next pass"
    fi
  done
}
316

            
317
init_bridge

log "tb-recover monitor started (interval: ${MONITOR_INTERVAL_SEC}s)"

while true; do
  handle_missing_interfaces
  monitor_interfaces

  # Sleep in the background and wait on it. Bash defers a trap while a
  # foreground command runs but interrupts the `wait` builtin, so the
  # SIGTERM/SIGINT handler runs promptly instead of after the full interval.
  sleep "$MONITOR_INTERVAL_SEC" &
  sleep_status=0
  wait "$!" || sleep_status=$?

  # Reached only when no trapped signal ended the wait (the handler exits).
  # A non-zero status then means sleep itself failed, e.g. on an invalid
  # interval; stop rather than spin in a tight loop.
  if [ "$sleep_status" -ne 0 ]; then
    exit "$sleep_status"
  fi
done