#!/bin/bash
# autoSMART Cluster Monitor
# Version: 1.0
# Description: Monitor autoSMART services across Proxmox cluster
# Configuration
# Every setting below may be overridden from the environment; the literals
# are the defaults this script has always used. Prefer exporting DB_PASS
# instead of relying on the built-in default.
CLUSTER_JSON="${CLUSTER_JSON:-$(dirname "$0")/../cluster.json}"
NODES=()
NODE_IPS=()

# Load hostname/ip pairs from .cluster.nodes[] with a single jq process.
# jq prints hostname and ip on alternating lines, which are read in pairs.
# A missing file or missing jq leaves the node list empty; say so on stderr
# so that an empty report is not mistaken for an empty cluster.
if [[ ! -f "$CLUSTER_JSON" ]]; then
  printf 'Warning: cluster file not found: %s (node list is empty)\n' \
    "$CLUSTER_JSON" >&2
elif ! command -v jq &> /dev/null; then
  printf 'Warning: jq is not installed; cannot read %s (node list is empty)\n' \
    "$CLUSTER_JSON" >&2
else
  while IFS= read -r _cfg_host && IFS= read -r _cfg_ip; do
    NODES+=("$_cfg_host")
    NODE_IPS+=("$_cfg_ip")
  done < <(jq -r '.cluster.nodes[] | .hostname, .ip' "$CLUSTER_JSON")
  unset _cfg_host _cfg_ip
fi

DB_HOST="${DB_HOST:-192.168.2.102}"
DB_USER="${DB_USER:-autosmart}"
DB_PASS="${DB_PASS:-autoSMART2025!}"
DB_NAME="${DB_NAME:-autosmart}"
# ANSI color codes for terminal output.
# The values are kept as literal backslash sequences; they turn into real
# escape characters only when printed with escape interpretation enabled.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
NC="\033[0m" # reset to the terminal default
# Print an informational message prefixed with a blue [INFO] tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_info() {
  # %b expands the escapes stored in the color variables; %s keeps the
  # message literal so backslashes inside it are not reinterpreted.
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}
# Print a success message prefixed with a green [SUCCESS] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_success() {
  # Only the color variables go through %b; the message itself is printed
  # with %s so backslashes inside it are not treated as escapes.
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}
# Print a warning message prefixed with a yellow [WARNING] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_warning() {
  # Only the color variables go through %b; the message itself is printed
  # with %s so backslashes inside it are not treated as escapes.
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}
# Print an error message prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout (callers compose it with `echo -n` prefixes)
log_error() {
  # Only the color variables go through %b; the message itself is printed
  # with %s so backslashes inside it are not treated as escapes.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}
# Print a section header line entirely in cyan.
# Globals:   CYAN, NC (read)
# Arguments: $1 - header text, printed verbatim
# Outputs:   one line on stdout
log_header() {
  # Only the color variables go through %b; the header text is printed
  # with %s so backslashes inside it are not treated as escapes.
  printf '%b%s%b\n' "$CYAN" "$1" "$NC"
}
# Print the command-line help text to stdout.
# $0 is expanded inside the here-document so the examples show the name
# the script was invoked with.
show_usage() {
  cat <<EOF
autoSMART Cluster Monitor v1.0

Usage: $0 [COMMAND] [OPTIONS]

Commands:
 status Show service status on all nodes
 logs [NODE] Show recent logs (all nodes or specific node)
 start Start services on all nodes
 stop Stop services on all nodes
 restart Restart services on all nodes
 deploy Deploy autoSMART to all nodes
 database Show database statistics
 health Show cluster health summary
 collect Force immediate SMART collection on all nodes

Options:
 --node NODE Target specific node (name from cluster.json)
 --watch Continuous monitoring (refresh every 10s)
 --verbose Show detailed output

Examples:
 $0 status # Show status on all nodes
 $0 status --node <node> # Show status on node from cluster.json
 $0 logs <node> # Show logs from node in cluster.json
 $0 health --watch # Continuous health monitoring
 $0 deploy # Deploy to all nodes

EOF
}
# Probe a node with a single ICMP echo request (2 second timeout).
# Arguments: $1 - node name (accepted for the callers' sake, not used here)
#            $2 - address handed to ping
# Returns:   0 if ping succeeds, 1 otherwise
check_node_connectivity() {
  local address=$2

  ping -c 1 -W 2 "$address" > /dev/null 2>&1 && return 0
  return 1
}
#######################################
# Print the autosmart service state for every node, or for one node.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional)
# Arguments: $1 - node name to restrict the report to; empty means all nodes
# Outputs:   status report on stdout
# Returns:   1 if $1 is set but matches no entry in NODES, otherwise 0
#######################################
show_service_status() {
  local target_node=$1
  local i node ip addr matched=0
  local status_output active enabled uptime last_log

  log_header "๐ autoSMART Service Status"
  log_header "============================="

  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
      continue
    fi
    matched=1

    # This script never sets NODE_IP_BASE. Prefix it only when the
    # environment provides it; otherwise the address from cluster.json is
    # used as-is, so ping and ssh always talk to the same host.
    addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

    echo ""
    log_info "Node: $node ($addr)"
    echo "----------------------------------------"

    if ! check_node_connectivity "$node" "$addr"; then
      log_error "NETWORK UNREACHABLE"
      continue
    fi

    # systemctl is-active / is-enabled print the state on stdout even when
    # they exit non-zero, so each result is captured remotely and echoed
    # exactly once; the fallback word is used only when nothing was printed.
    # This keeps the reply at exactly three lines.
    if ! status_output=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$addr" \
      "active=\$(systemctl is-active autosmart 2>/dev/null); echo \"\${active:-inactive}\"; \
       enabled=\$(systemctl is-enabled autosmart 2>/dev/null); echo \"\${enabled:-disabled}\"; \
       uptime | awk '{print \$3, \$4}' | sed 's/,//'" 2> /dev/null); then
      log_error "SSH CONNECTION FAILED"
      continue
    fi

    active=$(sed -n '1p' <<< "$status_output")
    enabled=$(sed -n '2p' <<< "$status_output")
    uptime=$(sed -n '3p' <<< "$status_output")

    echo -n " Status: "
    if [[ "$active" == "active" ]]; then
      log_success "RUNNING"
    else
      log_error "NOT RUNNING"
    fi

    echo -n " Enabled: "
    if [[ "$enabled" == "enabled" ]]; then
      log_success "YES"
    else
      log_warning "NO"
    fi

    echo " Uptime: $uptime"

    # Get recent activity (first two fields of the newest journal line)
    last_log=$(ssh -o ConnectTimeout=5 "root@$addr" \
      "journalctl -u autosmart --no-pager -n 1 --output=short-iso 2>/dev/null | tail -1" 2> /dev/null)
    if [[ -n "$last_log" ]]; then
      echo " Last Activity: $(awk '{print $1, $2}' <<< "$last_log")"
    fi
  done

  # A --node value that matches nothing used to produce an empty report.
  if [[ -n "$target_node" ]] && ((matched == 0)); then
    log_warning "Node '$target_node' not found in cluster configuration"
    return 1
  fi
  return 0
}
#######################################
# Print recent autosmart journal entries from every node, or from one node.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional)
# Arguments: $1 - node name to restrict to; empty means all nodes
#            $2 - number of journal lines per node (default 20)
# Outputs:   journal lines on stdout
# Returns:   1 if $2 is not a positive integer or $1 matches no node,
#            otherwise 0
#######################################
show_logs() {
  local target_node=$1
  local lines=${2:-20}
  local i node ip addr matched=0

  # The line count is pasted into the remote command line, so accept
  # nothing but digits.
  if [[ ! "$lines" =~ ^[0-9]+$ ]]; then
    log_error "Invalid line count: $lines"
    return 1
  fi

  log_header "๐ Recent Logs"
  log_header "==============="

  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
      continue
    fi
    matched=1

    # This script never sets NODE_IP_BASE. Prefix it only when the
    # environment provides it; otherwise the address from cluster.json is
    # used as-is, so ping and ssh always talk to the same host.
    addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

    echo ""
    log_info "Node: $node ($addr)"
    echo "----------------------------------------"

    if check_node_connectivity "$node" "$addr"; then
      ssh -o ConnectTimeout=5 "root@$addr" \
        "journalctl -u autosmart --no-pager -n $lines --output=short-iso 2>/dev/null || echo 'No logs available'" 2> /dev/null
    else
      log_error "Node unreachable"
    fi
  done

  # A node name that matches nothing used to produce an empty report.
  if [[ -n "$target_node" ]] && ((matched == 0)); then
    log_warning "Node '$target_node' not found in cluster configuration"
    return 1
  fi
  return 0
}
#######################################
# Run `systemctl <action> autosmart` on every node, or on one node.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional)
# Arguments: $1 - systemctl verb (the dispatcher passes start, stop, restart)
#            $2 - node name to restrict to; empty means all nodes
# Outputs:   per-node progress and result lines on stdout
# Returns:   1 if $2 is set but matches no entry in NODES, otherwise 0
#######################################
control_services() {
  local action=$1
  local target_node=$2
  local i node ip addr matched=0
  local gerund past

  # Appending "ing"/"ed" gives "stoping"/"stoped" for stop, so that verb
  # gets explicit forms.
  case "$action" in
    stop)
      gerund="stopping"
      past="stopped"
      ;;
    *)
      gerund="${action}ing"
      past="${action}ed"
      ;;
  esac

  log_header "๐ง ${action^} Services"
  log_header "==================="

  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
      continue
    fi
    matched=1

    # This script never sets NODE_IP_BASE. Prefix it only when the
    # environment provides it; otherwise the address from cluster.json is
    # used as-is, so ping and ssh always talk to the same host.
    addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

    echo ""
    log_info "Node: $node - ${gerund} autosmart service..."

    if ! check_node_connectivity "$node" "$addr"; then
      log_error "$node: Node unreachable"
      continue
    fi

    if ssh -o ConnectTimeout=5 "root@$addr" "systemctl $action autosmart" 2> /dev/null; then
      log_success "$node: Service ${past} successfully"
    else
      log_error "$node: Failed to $action service"
    fi
  done

  # A node name that matches nothing used to do nothing, silently.
  if [[ -n "$target_node" ]] && ((matched == 0)); then
    log_warning "Node '$target_node' not found in cluster configuration"
    return 1
  fi
  return 0
}
#######################################
# Print database totals and per-drive storage-efficiency figures via psql.
# Globals:   DB_HOST, DB_USER, DB_PASS, DB_NAME (read)
# Outputs:   psql result tables, or a diagnostic when psql is missing or
#            the connection test fails
# Returns:   status of the last command that ran
#######################################
show_database_stats() {
  # Connection arguments shared by all three psql invocations.
  local -a psql_cmd=(psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME")
  local summary_sql efficiency_sql

  # The query text is kept flush-left so the strings handed to psql stay
  # exactly as they were.
  summary_sql="
SELECT
'Total Drives' as metric, COUNT(DISTINCT serial_number)::text as value
FROM hdd_inventory
UNION ALL
SELECT
'Active Nodes', COUNT(DISTINCT current_node_id)::text
FROM hdd_inventory WHERE last_seen > NOW() - INTERVAL '1 hour'
UNION ALL
SELECT
'Total Readings', COUNT(*)::text
FROM smart_readings
UNION ALL
SELECT
'Readings Today', COUNT(*)::text
FROM smart_readings WHERE timestamp > CURRENT_DATE
UNION ALL
SELECT
'Latest Reading', MAX(timestamp)::text
FROM smart_readings;
"
  efficiency_sql="
SELECT
hi.serial_number,
hi.model_name,
COUNT(sr.id) as readings,
COUNT(DISTINCT sr.parameters_json) as unique_sets,
CASE
WHEN COUNT(DISTINCT sr.parameters_json) > 0
THEN ROUND((1 - COUNT(DISTINCT sr.parameters_json)::decimal / COUNT(sr.id)) * 100, 1)
ELSE 0
END as savings_percent
FROM hdd_inventory hi
LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id
GROUP BY hi.id, hi.serial_number, hi.model_name
HAVING COUNT(sr.id) > 0
ORDER BY readings DESC;
"

  log_header "๐ Database Statistics"
  log_header "====================="

  # Nothing can be queried without the client.
  if ! command -v psql &> /dev/null; then
    log_warning "psql not installed. Cannot check database statistics."
    return
  fi

  echo ""
  log_info "Connection: $DB_HOST:5432/$DB_NAME"
  echo ""

  # Probe the connection before running the real queries.
  if ! PGPASSWORD="$DB_PASS" "${psql_cmd[@]}" -c "SELECT 1;" > /dev/null 2>&1; then
    log_error "Database connection failed"
    log_info "Please check:"
    log_info " โข PostgreSQL server is running on $DB_HOST"
    log_info " โข Database '$DB_NAME' exists"
    log_info " โข User '$DB_USER' has proper permissions"
    return
  fi

  log_success "Database connection: OK"
  echo ""

  # Overall totals
  PGPASSWORD="$DB_PASS" "${psql_cmd[@]}" -c "$summary_sql" 2> /dev/null

  echo ""
  log_info "Storage Efficiency:"
  PGPASSWORD="$DB_PASS" "${psql_cmd[@]}" -c "$efficiency_sql" 2> /dev/null
}
#######################################
# Print a one-screen summary: reachable nodes, active/enabled services and
# database totals. Clears the screen before each report.
# Globals:   NODES, NODE_IPS, DB_HOST, DB_USER, DB_PASS, DB_NAME (read);
#            NODE_IP_BASE (read, optional)
# Arguments: $1 - "watch" to refresh every 10 seconds until interrupted;
#                 any other value prints the summary once
# Outputs:   summary on stdout
#######################################
show_cluster_health() {
  local watch_mode=$1
  local i node ip addr status enabled
  local total_nodes active_nodes enabled_nodes
  # Declared local so the fields filled in by `read` below do not leak into
  # the caller's scope.
  local db_stats drives nodes readings latest

  while true; do
    clear
    log_header "๐ฅ Cluster Health Summary"
    log_header "========================="
    echo "Last Update: $(date)"
    echo ""

    # Service status summary
    total_nodes=0
    active_nodes=0
    enabled_nodes=0

    for i in "${!NODES[@]}"; do
      node="${NODES[$i]}"
      ip="${NODE_IPS[$i]}"

      # This script never sets NODE_IP_BASE. Prefix it only when the
      # environment provides it; otherwise the address from cluster.json is
      # used as-is, so ping and ssh always talk to the same host.
      addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

      if check_node_connectivity "$node" "$addr"; then
        ((total_nodes++))
        status=$(ssh -o ConnectTimeout=5 "root@$addr" \
          "systemctl is-active autosmart 2>/dev/null" 2> /dev/null)
        enabled=$(ssh -o ConnectTimeout=5 "root@$addr" \
          "systemctl is-enabled autosmart 2>/dev/null" 2> /dev/null)
        if [[ "$status" == "active" ]]; then
          ((active_nodes++))
        fi
        if [[ "$enabled" == "enabled" ]]; then
          ((enabled_nodes++))
        fi
      fi
    done

    echo "๐ก Cluster Status:"
    echo " โข Total Nodes: $total_nodes/${#NODES[@]}"
    echo " โข Active Services: $active_nodes/$total_nodes"
    echo " โข Enabled Services: $enabled_nodes/$total_nodes"
    echo ""

    # Quick database check
    if command -v psql &> /dev/null; then
      if PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -c "SELECT 1;" > /dev/null 2>&1; then
        # COUNT(sr.id) counts readings only; COUNT(*) would also count the
        # padding row the LEFT JOIN emits for a drive without readings.
        # COALESCE keeps the row intact when there are no readings at all,
        # because concatenating NULL would null the whole string.
        db_stats=$(PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -t -c "
SELECT
COUNT(DISTINCT hi.serial_number) || '|' ||
COUNT(DISTINCT hi.current_node_id) || '|' ||
COUNT(sr.id) || '|' ||
COALESCE(MAX(sr.timestamp)::text, 'none')
FROM hdd_inventory hi
LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id;
" 2> /dev/null | xargs)
        IFS='|' read -r drives nodes readings latest <<< "$db_stats"
        echo "๐๏ธ Database Status:"
        echo " โข Connection: OK"
        echo " โข Drives Tracked: $drives"
        echo " โข Active Nodes: $nodes"
        echo " โข Total Readings: $readings"
        # Drop fractional seconds from the timestamp
        echo " โข Latest Reading: $(echo "$latest" | cut -d'.' -f1)"
      else
        echo "๐๏ธ Database Status: โ CONNECTION FAILED"
      fi
    fi

    if [[ "$watch_mode" != "watch" ]]; then
      break
    fi
    echo ""
    echo "Press Ctrl+C to exit watch mode..."
    sleep 10
  done
}
#######################################
# Deploy autoSMART to the cluster. Runs deploy-production.sh from this
# script's directory when it exists; otherwise copies the parent directory
# to each node and runs scripts/deploy.sh there.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional)
# Outputs:   progress and per-node result lines on stdout
# Returns:   status of deploy-production.sh when it is used; otherwise the
#            status of the last command that ran
#######################################
deploy_cluster() {
  local script_dir deploy_script source_root
  local remote_dir="/tmp/autosmart-deploy"
  local i node ip addr

  log_header "๐ Cluster Deployment"
  log_header "==================="

  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  deploy_script="$script_dir/deploy-production.sh"

  if [[ -f "$deploy_script" ]]; then
    log_info "Running cluster deployment script..."
    bash "$deploy_script"
    return
  fi

  log_error "Deployment script not found: $deploy_script"
  log_info "Deploying manually to each node..."
  source_root="$(dirname "$script_dir")"

  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # This script never sets NODE_IP_BASE. Prefix it only when the
    # environment provides it; otherwise the address from cluster.json is
    # used as-is, so ping, ssh and scp always talk to the same host.
    addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

    echo ""
    log_info "Deploying to $node ($addr)..."

    if ! check_node_connectivity "$node" "$addr"; then
      log_error "$node: Node unreachable"
      continue
    fi

    # Three steps, each gated on the previous one:
    #  1. create the target directory (scp needs it to exist when it is
    #     given several sources),
    #  2. copy the autoSMART files,
    #  3. run the installer, with the node id shell-quoted for the remote
    #     command line.
    # A failed copy therefore no longer runs the installer on stale files.
    if ssh "root@$addr" "mkdir -p $remote_dir" 2> /dev/null \
      && scp -r "$source_root"/* "root@$addr:$remote_dir/" 2> /dev/null \
      && ssh "root@$addr" "cd $remote_dir/scripts && bash deploy.sh install --force-reinstall --node-id $(printf '%q' "$node")" 2> /dev/null; then
      log_success "$node: Deployment successful"
    else
      log_error "$node: Deployment failed"
    fi
  done
}
#######################################
# Ask the collector on every node to run a SMART collection immediately.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional)
# Outputs:   per-node progress and result lines on stdout
#######################################
force_collection() {
  local i node ip addr

  log_header "๐ Force SMART Collection"
  log_header "========================="

  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # This script never sets NODE_IP_BASE. Prefix it only when the
    # environment provides it; otherwise the address from cluster.json is
    # used as-is, so ping and ssh always talk to the same host.
    addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

    echo ""
    log_info "Node: $node - Triggering SMART collection..."

    if ! check_node_connectivity "$node" "$addr"; then
      log_error "$node: Node unreachable"
      continue
    fi

    # Send SIGHUP to the daemon to trigger an immediate collection.
    # pkill -f matches whole command lines, and the remote shell's own
    # command line contains this pattern. Writing it as [s]mart-... still
    # matches the daemon but no longer matches the literal text, so the
    # shell running this command is not signalled.
    if ssh "root@$addr" \
      "pkill -HUP -f '[s]mart-collector-daemon' || systemctl reload autosmart" 2> /dev/null; then
      log_success "$node: Collection triggered"
    else
      log_warning "$node: Signal sent, check service status"
    fi
  done
}
# Parse command line arguments
COMMAND=""
TARGET_NODE=""
WATCH_MODE=false
VERBOSE=false

# Return 0 when $1 is a node name: either listed in NODES (loaded from
# cluster.json) or one of the legacy names this script has always accepted.
_is_known_node() {
  local candidate=$1 known
  case "$candidate" in
    ebony | ivory | obsidian) return 0 ;;
  esac
  for known in "${NODES[@]}"; do
    if [[ "$known" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Parse the script's arguments into COMMAND, TARGET_NODE, WATCH_MODE and
# VERBOSE.
# Globals:   COMMAND, TARGET_NODE, WATCH_MODE, VERBOSE (written);
#            NODES (read)
# Arguments: the script's positional parameters
# Exits:     0 after --help/-h; 1 on an unknown option or when --node has
#            no value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      status | logs | start | stop | restart | deploy | database | health | collect)
        COMMAND="$1"
        shift
        ;;
      --node)
        # `shift 2` with a single remaining argument fails without
        # shifting, which would make this loop spin forever.
        if [[ $# -lt 2 ]]; then
          log_error "Option --node requires a node name"
          show_usage
          exit 1
        fi
        TARGET_NODE="$2"
        shift 2
        ;;
      --watch)
        WATCH_MODE=true
        shift
        ;;
      --verbose)
        VERBOSE=true
        shift
        ;;
      --help | -h)
        show_usage
        exit 0
        ;;
      *)
        if _is_known_node "$1"; then
          # Allow node names as direct arguments for the logs command;
          # for other commands a bare node name is ignored, as before.
          if [[ "$COMMAND" == "logs" ]]; then
            TARGET_NODE="$1"
          fi
        elif [[ -z "$COMMAND" ]]; then
          # Unknown word in command position: keep it so the dispatcher
          # reports it as an unknown command.
          COMMAND="$1"
        else
          log_error "Unknown option: $1"
          show_usage
          exit 1
        fi
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Fall back to "status" when no command was given
: "${COMMAND:=status}"

# Run the selected command
case "$COMMAND" in
  status)
    if [[ "$WATCH_MODE" != true ]]; then
      show_service_status "$TARGET_NODE"
    else
      # Redraw the report every 10 seconds until interrupted
      while :; do
        clear
        show_service_status "$TARGET_NODE"
        echo ""
        echo "Press Ctrl+C to exit watch mode..."
        sleep 10
      done
    fi
    ;;
  logs)
    show_logs "$TARGET_NODE"
    ;;
  start | stop | restart)
    control_services "$COMMAND" "$TARGET_NODE"
    ;;
  database)
    show_database_stats
    ;;
  health)
    # The health report takes the literal word "watch" to enable its
    # refresh loop and an empty argument otherwise.
    if [[ "$WATCH_MODE" == true ]]; then
      show_cluster_health "watch"
    else
      show_cluster_health ""
    fi
    ;;
  deploy)
    deploy_cluster
    ;;
  collect)
    force_collection
    ;;
  *)
    log_error "Unknown command: $COMMAND"
    show_usage
    exit 1
    ;;
esac

exit 0