f16725e 3 months ago History
1 contributor
316 lines | 7.902kb
#!/usr/bin/env bash
# Thunderbolt bridge recovery helper: keeps the bridge up, enslaves every
# thunderbolt* netdev to it, tries to bring missing netdevs back, and probes
# the configured peer behind each interface.
set -euo pipefail

# Name of the bridge that thunderbolt interfaces are enslaved to.
BRIDGE="thunderbridge"
# MTU applied to the bridge and to every thunderbolt interface.
MTU="65520"
# Flipped to 1 by the enslave loop once any /sys/class/net/thunderbolt* exists.
FOUND_TB_IFACE=0
# Directory holding the cooldown timestamps and per-interface failure counters.
STATE_DIR="/run/tb-recover"
# Epoch of the last bolt.service restart and the minimum gap between restarts.
LAST_BOLT_RESTART_FILE="${STATE_DIR}/last_bolt_restart_epoch"
BOLT_RESTART_COOLDOWN_SEC=600
# Epoch of the last PCI NHI remove+rescan and the minimum gap between them.
LAST_NHI_RESCAN_FILE="${STATE_DIR}/last_nhi_rescan_epoch"
NHI_RESCAN_COOLDOWN_SEC=600
# Seconds slept after an NHI remove+rescan before re-triggering udev.
NHI_SETTLE_SEC=8
# The next four values can be overridden through the TB_* environment variables.
# Number of failed peer probes recorded before an interface is cycled.
PEER_FAIL_THRESHOLD="${TB_PEER_FAIL_THRESHOLD:-2}"
# Minimum seconds between two cycles of the same interface.
IFACE_CYCLE_COOLDOWN_SEC="${TB_IFACE_CYCLE_COOLDOWN_SEC:-300}"
# Seconds slept after an interface has been cycled.
IFACE_CYCLE_SETTLE_SEC="${TB_IFACE_CYCLE_SETTLE_SEC:-5}"
# Value handed to ping -W when probing a peer.
PING_TIMEOUT_SEC="${TB_PING_TIMEOUT_SEC:-1}"
# Short host name (falls back to plain `hostname`); selects the peer IP table entry.
LOCAL_HOST="$(hostname -s 2>/dev/null || hostname)"

# The state directory must exist before any timestamp/counter file is written.
mkdir -p "$STATE_DIR"

# Print all arguments as one line on stdout, prefixed with an ISO-8601
# timestamp (second precision) taken from `date -Is`.
log() {
  printf '%s\n' "$(date -Is) $*"
}

# Succeed (0) when $1 resolves to something the shell can run, fail (1)
# otherwise. Produces no output.
command_exists() {
  local tool="$1"

  if command -v "$tool" &>/dev/null; then
    return 0
  fi
  return 1
}

# Print the path of the failed-probe counter file kept for interface $1
# inside STATE_DIR.
counter_file_for_iface() {
  local iface="$1"

  printf '%s\n' "${STATE_DIR}/peer-fail-${iface}.count"
}

# Print the path of the file that stores the epoch of the last link cycle
# performed on interface $1, inside STATE_DIR.
cooldown_file_for_iface() {
  local iface="$1"

  printf '%s\n' "${STATE_DIR}/last-iface-cycle-${iface}.epoch"
}

# Print the non-negative integer stored in file $1 as canonical decimal.
#
# Arguments: $1 - path of a file expected to hold a single run of digits
# Outputs:   the value on stdout; 0 when the file is missing, unreadable,
#            empty, contains anything but digits, or is too long to fit in
#            the shell's 64-bit arithmetic
# Returns:   always 0
read_epoch_file() {
  local file="$1"
  local value="0"

  if [ -f "$file" ]; then
    value="$(cat "$file" 2>/dev/null || echo 0)"
  fi

  case "$value" in
    ''|*[!0-9]*)
      value=0
      ;;
    *)
      # Callers feed the result into $(( )), where a leading 0 means octal
      # ("08" is an arithmetic error), so strip leading zeros first.
      value="${value#"${value%%[!0]*}"}"
      if [ -z "$value" ]; then
        value=0
      fi
      # More than 18 digits cannot be a sane epoch/counter and would
      # overflow signed 64-bit arithmetic; treat it like corrupt content.
      if [ "${#value}" -gt 18 ]; then
        value=0
      fi
      ;;
  esac

  printf '%s\n' "$value"
}

# Print the counter stored in file $1. Counters use the same on-disk format
# as epoch files, so the work is delegated to read_epoch_file.
read_counter_file() {
  local counter_path="$1"

  read_epoch_file "$counter_path"
}

# Print the peer IP configured for interface $1 on this host (LOCAL_HOST).
# Returns 1 and prints nothing when the host/interface pair has no entry.
peer_ip_for_iface() {
  local iface="$1"
  local peer=""

  case "${LOCAL_HOST}:${iface}" in
    baobab:thunderbolt0) peer="192.168.10.92" ;;
    baobab:thunderbolt1) peer="192.168.10.93" ;;
    ebony:thunderbolt0 | tapia:thunderbolt0) peer="192.168.10.91" ;;
  esac

  if [ -z "$peer" ]; then
    return 1
  fi
  printf '%s\n' "$peer"
}

# Succeed when the bridge-port state of interface $1, as read from
# /sys/class/net/<iface>/brport/state, is "3". Fails when the attribute is
# not readable (for example the interface is not a bridge port).
iface_is_forwarding() {
  local iface="$1"
  local port_state=""
  local attr="/sys/class/net/${iface}/brport/state"

  if [ ! -r "$attr" ]; then
    return 1
  fi

  port_state="$(cat "$attr" 2>/dev/null || echo 0)"
  [ "$port_state" = "3" ]
}

# Succeed when /sys/class/net/<iface>/operstate for interface $1 reads "up".
# Fails when the attribute is not readable or holds any other value.
iface_is_oper_up() {
  local iface="$1"
  local oper=""
  local attr="/sys/class/net/${iface}/operstate"

  if [ ! -r "$attr" ]; then
    return 1
  fi

  oper="$(cat "$attr" 2>/dev/null || true)"
  [ "$oper" = "up" ]
}

# Send a single ping to peer $1 through the bridge and return ping's status.
# Any neighbour entry for the peer on the bridge is deleted first; a failure
# of that deletion is ignored.
probe_peer_ip() {
  local peer_ip="$1"
  local -a ping_args=(-I "$BRIDGE" -n -c 1 -W "$PING_TIMEOUT_SEC" "$peer_ip")

  if ! ip neigh del "$peer_ip" dev "$BRIDGE" 2>/dev/null; then
    :
  fi

  ping "${ping_args[@]}" &>/dev/null
}

# Bounce interface $1 because its peer $2 stopped answering.
#
# Does nothing (returns 0) while the per-interface cooldown is still running.
# Otherwise cycles the link with ifdown/ifup when both exist, or with
# `ip link set down/up` when they do not, then re-applies MTU and bridge
# membership, starts tb-enlist@<iface>.service, records the cycle time,
# clears the failure counter and sleeps IFACE_CYCLE_SETTLE_SEC.
# Returns 1 when ifup fails; in that case nothing after ifup is done.
recover_iface_cycle() {
  local iface="$1"
  local peer_ip="$2"
  local started_at=""
  local stamp_file=""
  local previous_cycle=""

  started_at="$(date +%s)"
  stamp_file="$(cooldown_file_for_iface "$iface")"
  previous_cycle="$(read_epoch_file "$stamp_file")"

  if [ $((started_at - previous_cycle)) -lt "$IFACE_CYCLE_COOLDOWN_SEC" ]; then
    log "peer ${peer_ip} still unhealthy on ${iface}, but iface cycle is cooling down"
    return 0
  fi

  log "peer ${peer_ip} unhealthy on ${iface}; cycling link with ifdown/ifup"
  if ! command_exists ifdown || ! command_exists ifup; then
    log "ifdown/ifup unavailable; falling back to ip link bounce for ${iface}"
    ip link set "$iface" down || true
    sleep 2
    ip link set "$iface" up || true
  else
    # A failing ifdown is only reported; the ifup attempt still follows.
    if ! ifdown --force "$iface"; then
      log "ifdown reported a non-zero exit code for ${iface}"
    fi
    sleep 2
    if ! ifup "$iface"; then
      log "ifup failed for ${iface}"
      return 1
    fi
  fi

  # Best effort: restore the settings the link needs as a bridge port.
  ip link set "$iface" mtu "$MTU" || true
  ip link set "$iface" master "$BRIDGE" || true
  systemctl start "tb-enlist@${iface}.service" || true

  # Start the cooldown from the moment this cycle began and reset the counter.
  printf '%s\n' "$started_at" > "$stamp_file"
  rm -f "$(counter_file_for_iface "$iface")"
  sleep "$IFACE_CYCLE_SETTLE_SEC"
}

# Probe the peer configured for interface $1 and escalate on repeated failure.
#
# Interfaces without a configured peer are ignored. The failure counter is
# cleared whenever the link is not operationally up, is not a forwarding
# bridge port, or the probe succeeds. Each failed probe increments the
# counter; once it reaches PEER_FAIL_THRESHOLD the interface is cycled and
# the status of recover_iface_cycle is returned.
assess_peer_health() {
  local iface="$1"
  local target=""
  local tally_file=""
  local misses=0

  target="$(peer_ip_for_iface "$iface")" || return 0
  tally_file="$(counter_file_for_iface "$iface")"

  # Only a failed probe on a link that is up and forwarding counts as a miss.
  if iface_is_oper_up "$iface" && iface_is_forwarding "$iface" && ! probe_peer_ip "$target"; then
    misses="$(read_counter_file "$tally_file")"
    misses=$((misses + 1))
    printf '%s\n' "$misses" > "$tally_file"
    log "peer probe failed on ${iface} towards ${target} (${misses}/${PEER_FAIL_THRESHOLD})"
  else
    rm -f "$tally_file"
    return 0
  fi

  if [ "$misses" -lt "$PEER_FAIL_THRESHOLD" ]; then
    return 0
  fi

  recover_iface_cycle "$iface" "$target"
}

# Succeed when at least one /sys/class/net/thunderbolt* entry exists.
# The status is whatever `ls` reports for that glob; nothing is printed.
has_tb_netdev() {
  local rc=0

  ls /sys/class/net/thunderbolt* &>/dev/null || rc=$?
  return "$rc"
}

# Succeed when some /sys/bus/thunderbolt/devices entry named like "N-M..."
# (without '.' or ':' in its name) has no sibling entries named "<entry>.*".
# Returns 1 when every such entry has at least one, or none exist.
has_stale_tb_xdomain() {
  local node=""
  local name=""

  for node in /sys/bus/thunderbolt/devices/[0-9]-[1-9]*; do
    if [ ! -e "$node" ]; then
      continue
    fi

    # Names containing '.' or ':' are skipped; only plain "N-M" nodes count.
    name="${node##*/}"
    if [[ "$name" == *.* || "$name" == *:* ]]; then
      continue
    fi

    # No "<node>.*" sibling means the node exists without any child entry.
    ls "${node}".* &>/dev/null || return 0
  done

  return 1
}

# Write 1 to the rescan attribute of every thunderbolt domain that has one,
# then ask udev to replay "change" events for the thunderbolt subsystem and
# "add" events for the net subsystem. Every step is best effort.
trigger_tb_rescan() {
  local tb_domain=""

  for tb_domain in /sys/bus/thunderbolt/devices/domain*; do
    if [ -e "$tb_domain/rescan" ]; then
      echo 1 > "$tb_domain/rescan" || true
    fi
  done

  if ! udevadm trigger --subsystem-match=thunderbolt --action=change; then
    :
  fi
  if ! udevadm trigger --subsystem-match=net --action=add; then
    :
  fi
}

# Remove the first PCI device whose class is 0x088000 and whose bound driver
# is "thunderbolt", then trigger a PCI bus rescan.
#
# Arguments: $1 - epoch written to LAST_NHI_RESCAN_FILE after the attempt
# Returns:   0 when a matching device was found (the remove/rescan writes
#            themselves are best effort), 1 when no candidate exists
run_nhi_rescan() {
  local epoch="$1"
  local candidate=""
  local pci_class=""
  local driver_name=""
  local nhi_pci=""

  for candidate in /sys/bus/pci/devices/*; do
    # Only devices with a class, a bound driver and a writable remove node
    # are considered.
    if [ ! -e "$candidate/class" ] || [ ! -e "$candidate/driver" ] || [ ! -w "$candidate/remove" ]; then
      continue
    fi

    pci_class="$(cat "$candidate/class" 2>/dev/null || true)"
    driver_name="$(basename "$(readlink -f "$candidate/driver" 2>/dev/null || true)")"
    if [ "$pci_class" = "0x088000" ] && [ "$driver_name" = "thunderbolt" ]; then
      nhi_pci="$candidate"
      break
    fi
  done

  if [ -z "$nhi_pci" ]; then
    return 1
  fi

  echo 1 > "$nhi_pci/remove" || true
  sleep 1
  echo 1 > /sys/bus/pci/rescan || true
  printf '%s\n' "$epoch" > "$LAST_NHI_RESCAN_FILE"
  return 0
}

# Make sure the bridge exists, carries the configured MTU and is up before
# any port is enslaved to it. Every step is best effort.
if ! ip link show "$BRIDGE" >/dev/null 2>&1; then
  ip link add name "$BRIDGE" type bridge || true
fi
ip link set "$BRIDGE" mtu "$MTU" || true
ip link set "$BRIDGE" up || true

# Bring every existing thunderbolt netdev up, give it the bridge MTU, enslave
# it to the bridge and start its tb-enlist@ unit. Remember that at least one
# such netdev was found.
for path in /sys/class/net/thunderbolt*; do
  # An unmatched glob leaves the literal pattern behind; skip it.
  if [ -e "$path" ]; then
    IFACE="${path##*/}"
    FOUND_TB_IFACE=1
    ip link set "$IFACE" up || true
    ip link set "$IFACE" mtu "$MTU" || true
    ip link set "$IFACE" master "$BRIDGE" || true
    systemctl start "tb-enlist@${IFACE}.service" || true
  fi
done

# If no thunderbolt netdev exists but a TB domain exists, force a rescan + udev retrigger.
if [ "$FOUND_TB_IFACE" -eq 0 ] && [ -d /sys/bus/thunderbolt/devices ]; then
  trigger_tb_rescan

  # Escalate with cooldown: try PCI NHI remove+rescan to emulate a soft replug.
  sleep 2
  if ! has_tb_netdev; then
    now="$(date +%s)"
    # read_epoch_file prints 0 for a missing, empty or non-numeric file, so
    # both cooldowns count as expired when no valid timestamp is recorded.
    last="$(read_epoch_file "$LAST_BOLT_RESTART_FILE")"
    nhi_last="$(read_epoch_file "$LAST_NHI_RESCAN_FILE")"

    if [ $((now - nhi_last)) -ge "$NHI_RESCAN_COOLDOWN_SEC" ]; then
      if run_nhi_rescan "$now"; then
        sleep "$NHI_SETTLE_SEC"
        trigger_tb_rescan

        # On newer kernels the first NHI reset can stop at the peer xdomain host
        # node without recreating the matching *.0 network service.
        if ! has_tb_netdev && has_stale_tb_xdomain; then
          retry_now="$(date +%s)"
          if run_nhi_rescan "$retry_now"; then
            sleep "$NHI_SETTLE_SEC"
            trigger_tb_rescan
          fi
        fi
      fi
    fi

    # Secondary fallback with cooldown: restart boltd if interface is still missing
    # and the host actually uses that service.
    if ! has_tb_netdev; then
      if [ $((now - last)) -ge "$BOLT_RESTART_COOLDOWN_SEC" ]; then
        if systemctl list-unit-files bolt.service >/dev/null 2>&1; then
          systemctl restart bolt.service || true
          printf '%s\n' "$now" > "$LAST_BOLT_RESTART_FILE"
        fi
      fi
    fi

    # Final retrigger so anything that appeared during the escalation is
    # announced to udev.
    trigger_tb_rescan
  fi
fi

# Probe the configured peer behind every thunderbolt interface.
#
# A failed recovery on one interface must not stop the others from being
# assessed: under `set -e` a bare call would abort the script on the first
# non-zero return. Failures are collected instead and reported through the
# exit status once every interface has been looked at. Note that calling the
# function as an `if` condition suspends errexit inside it.
health_failed=0
for path in /sys/class/net/thunderbolt*; do
  [ -e "$path" ] || continue
  if ! assess_peer_health "${path##*/}"; then
    log "peer health recovery failed for ${path##*/}; continuing with remaining interfaces"
    health_failed=1
  fi
done

if [ "$health_failed" -ne 0 ]; then
  exit 1
fi