#!/bin/bash
# autoSMART Cluster Monitor
# Version: 1.0
# Description: Monitor autoSMART services across Proxmox cluster
#
# Node names and addresses are read from ../cluster.json (relative to this
# script) using jq: each element of .cluster.nodes[] must provide .hostname
# and .ip.
#
# Optional environment overrides:
#   NODE_IP_BASE  network prefix prepended to each node's .ip ("<base>.<ip>");
#                 leave unset when cluster.json already holds full addresses
#   DB_HOST, DB_USER, DB_PASS, DB_NAME  PostgreSQL connection settings

# Configuration
CLUSTER_JSON="$(dirname "$0")/../cluster.json"
NODES=()
NODE_IPS=()
NODE_IP_BASE="${NODE_IP_BASE:-}"

DB_HOST="${DB_HOST:-192.168.2.102}"
DB_USER="${DB_USER:-autosmart}"
# NOTE(review): a default password is kept here only for backward
# compatibility; prefer supplying DB_PASS via the environment or ~/.pgpass.
DB_PASS="${DB_PASS:-autoSMART2025!}"
DB_NAME="${DB_NAME:-autosmart}"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Logging helpers: print "$1" with a colored severity tag on stdout.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}

log_header() {
  echo -e "${CYAN}$1${NC}"
}

# Print the command line help text.
show_usage() {
  cat <<EOF
autoSMART Cluster Monitor v1.0

Usage: $0 [COMMAND] [OPTIONS]

Commands:
 status Show service status on all nodes
 logs [NODE] Show recent logs (all nodes or specific node)
 start Start services on all nodes
 stop Stop services on all nodes
 restart Restart services on all nodes
 deploy Deploy autoSMART to all nodes
 database Show database statistics
 health Show cluster health summary
 collect Force immediate SMART collection on all nodes

Options:
 --node NODE Target specific node (name from cluster.json)
 --watch Continuous monitoring (refresh every 10s)
 --verbose Show detailed output

Examples:
 $0 status # Show status on all nodes
 $0 status --node NODE # Show status on node from cluster.json
 $0 logs NODE # Show logs from node in cluster.json
 $0 health --watch # Continuous health monitoring
 $0 deploy # Deploy to all nodes

EOF
}

# Populate the global NODES / NODE_IPS arrays from $CLUSTER_JSON.
# Leaves both arrays empty when the file is missing or jq is not installed.
load_cluster_nodes() {
  NODES=()
  NODE_IPS=()
  if [[ -f "$CLUSTER_JSON" ]] && command -v jq &> /dev/null; then
    local entry
    while IFS= read -r entry; do
      NODES+=("$(jq -r '.hostname' <<< "$entry")")
      NODE_IPS+=("$(jq -r '.ip' <<< "$entry")")
    done < <(jq -c '.cluster.nodes[]' "$CLUSTER_JSON")
  fi
}

# Print the address used to reach a node.
# Arguments: $1 - the node's .ip value from cluster.json
# Outputs:   "<NODE_IP_BASE>.<ip>" when NODE_IP_BASE is set, otherwise "<ip>"
node_address() {
  local ip=$1
  if [[ -n "${NODE_IP_BASE:-}" ]]; then
    printf '%s.%s\n' "$NODE_IP_BASE" "$ip"
  else
    printf '%s\n' "$ip"
  fi
}

# Return 0 when $1 is one of the names in NODES, 1 otherwise.
is_known_node() {
  local wanted=$1
  local name
  for name in "${NODES[@]}"; do
    if [[ "$name" == "$wanted" ]]; then
      return 0
    fi
  done
  return 1
}

# Return 0 when the node at index-independent name $1 should be processed
# given the (possibly empty) filter $2.
node_selected() {
  local node=$1
  local target_node=$2
  [[ -z "$target_node" || "$node" == "$target_node" ]]
}

# Warn (on stderr) about selections that would make a command a silent no-op.
check_node_selection() {
  if [[ ${#NODES[@]} -eq 0 ]]; then
    log_warning "No nodes loaded from $CLUSTER_JSON (file missing or jq not installed)" >&2
  elif [[ -n "$TARGET_NODE" ]] && ! is_known_node "$TARGET_NODE"; then
    log_warning "Node '$TARGET_NODE' is not listed in $CLUSTER_JSON" >&2
  fi
}

# Check whether a node answers a single ping within 2 seconds.
# Arguments: $1 - node name (unused, kept for callers), $2 - address to ping
# Returns:   0 if reachable, 1 otherwise
check_node_connectivity() {
  local node=$1
  local ip=$2

  if ping -c 1 -W 2 "$ip" > /dev/null 2>&1; then
    return 0
  else
    return 1
  fi
}

# Print the remote shell command used by show_service_status.
# The command emits exactly three lines: active state, enabled state, uptime.
# systemctl prints the state itself even when it exits non-zero, so the
# fallback is applied only when its output is empty; appending "|| echo ..."
# would add an extra line and shift the fields.
status_probe_command() {
  local cmd
  cmd='a=$(systemctl is-active autosmart 2>/dev/null | head -n 1); echo "${a:-inactive}"; '
  cmd+='e=$(systemctl is-enabled autosmart 2>/dev/null | head -n 1); echo "${e:-disabled}"; '
  cmd+="uptime | awk '{print \$3, \$4}' | sed 's/,//'"
  printf '%s\n' "$cmd"
}

# Show the autosmart service state, enablement, uptime and last journal
# timestamp for every node, or only for the node named in $1.
show_service_status() {
  local target_node=$1

  log_header "🔍 autoSMART Service Status"
  log_header "============================="

  local i node ip addr
  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    node_selected "$node" "$target_node" || continue
    addr=$(node_address "$ip")

    echo ""
    log_info "Node: $node ($addr)"
    echo "----------------------------------------"

    if ! check_node_connectivity "$node" "$addr"; then
      log_error "NETWORK UNREACHABLE"
      continue
    fi

    local status_output
    if ! status_output=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
      "root@$addr" "$(status_probe_command)" 2> /dev/null); then
      log_error "SSH CONNECTION FAILED"
      continue
    fi

    local active="" enabled="" node_uptime=""
    {
      IFS= read -r active
      IFS= read -r enabled
      IFS= read -r node_uptime
    } <<< "$status_output"

    echo -n " Status: "
    if [[ "$active" == "active" ]]; then
      log_success "RUNNING"
    else
      log_error "NOT RUNNING"
    fi

    echo -n " Enabled: "
    if [[ "$enabled" == "enabled" ]]; then
      log_success "YES"
    else
      log_warning "NO"
    fi

    echo " Uptime: $node_uptime"

    # Get recent activity
    local last_log
    last_log=$(ssh -o ConnectTimeout=5 "root@$addr" \
      "journalctl -u autosmart --no-pager -n 1 --output=short-iso 2>/dev/null | tail -1" 2> /dev/null)
    if [[ -n "$last_log" ]]; then
      echo " Last Activity: $(awk '{print $1, $2}' <<< "$last_log")"
    fi
  done
}

# Show the most recent autosmart journal entries.
# Arguments: $1 - optional node name filter, $2 - number of lines (default 20)
show_logs() {
  local target_node=$1
  local lines=${2:-20}

  # $lines is interpolated into a remote command, so only accept digits.
  if [[ ! "$lines" =~ ^[0-9]+$ ]]; then
    log_error "Invalid line count: $lines"
    return 1
  fi

  log_header "📋 Recent Logs"
  log_header "==============="

  local i node ip addr
  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    node_selected "$node" "$target_node" || continue
    addr=$(node_address "$ip")

    echo ""
    log_info "Node: $node ($addr)"
    echo "----------------------------------------"

    if check_node_connectivity "$node" "$addr"; then
      ssh -o ConnectTimeout=5 "root@$addr" \
        "journalctl -u autosmart --no-pager -n $lines --output=short-iso 2>/dev/null || echo 'No logs available'" 2> /dev/null
    else
      log_error "Node unreachable"
    fi
  done
}

# Run "systemctl <action> autosmart" on every node (or only on $2).
# Arguments: $1 - start | stop | restart, $2 - optional node name filter
control_services() {
  local action=$1
  local target_node=$2

  # "stop" doubles its final consonant: stopping / stopped.
  local stem=$action
  if [[ "$action" == "stop" ]]; then
    stem="stopp"
  fi

  log_header "🔧 ${action^} Services"
  log_header "==================="

  local i node ip addr
  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    node_selected "$node" "$target_node" || continue
    addr=$(node_address "$ip")

    echo ""
    log_info "Node: $node - ${stem}ing autosmart service..."

    if check_node_connectivity "$node" "$addr"; then
      if ssh -o ConnectTimeout=5 "root@$addr" "systemctl $action autosmart" 2> /dev/null; then
        log_success "$node: Service ${stem}ed successfully"
      else
        log_error "$node: Failed to $action service"
      fi
    else
      log_error "$node: Node unreachable"
    fi
  done
}

# Run psql against the configured database; extra arguments are passed through.
run_psql() {
  PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" "$@"
}

# Print drive/reading counts and per-drive storage efficiency from PostgreSQL.
show_database_stats() {
  log_header "📊 Database Statistics"
  log_header "====================="

  if ! command -v psql &> /dev/null; then
    log_warning "psql not installed. Cannot check database statistics."
    return
  fi

  echo ""
  log_info "Connection: $DB_HOST:5432/$DB_NAME"
  echo ""

  # Test connection
  if ! run_psql -c "SELECT 1;" > /dev/null 2>&1; then
    log_error "Database connection failed"
    log_info "Please check:"
    log_info " • PostgreSQL server is running on $DB_HOST"
    log_info " • Database '$DB_NAME' exists"
    log_info " • User '$DB_USER' has proper permissions"
    return
  fi

  log_success "Database connection: OK"
  echo ""

  # Get statistics
  run_psql -c "
    SELECT 'Total Drives' as metric, COUNT(DISTINCT serial_number)::text as value
    FROM hdd_inventory
    UNION ALL
    SELECT 'Active Nodes', COUNT(DISTINCT current_node_id)::text
    FROM hdd_inventory WHERE last_seen > NOW() - INTERVAL '1 hour'
    UNION ALL
    SELECT 'Total Readings', COUNT(*)::text
    FROM smart_readings
    UNION ALL
    SELECT 'Readings Today', COUNT(*)::text
    FROM smart_readings WHERE timestamp > CURRENT_DATE
    UNION ALL
    SELECT 'Latest Reading', MAX(timestamp)::text
    FROM smart_readings;
  " 2> /dev/null

  echo ""
  log_info "Storage Efficiency:"
  run_psql -c "
    SELECT
      hi.serial_number,
      hi.model_name,
      COUNT(sr.id) as readings,
      COUNT(DISTINCT sr.parameters_json) as unique_sets,
      CASE
        WHEN COUNT(DISTINCT sr.parameters_json) > 0
        THEN ROUND((1 - COUNT(DISTINCT sr.parameters_json)::decimal / COUNT(sr.id)) * 100, 1)
        ELSE 0
      END as savings_percent
    FROM hdd_inventory hi
    LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id
    GROUP BY hi.id, hi.serial_number, hi.model_name
    HAVING COUNT(sr.id) > 0
    ORDER BY readings DESC;
  " 2> /dev/null
}

# Print a one-screen summary of reachable nodes, service state and database
# status. Arguments: $1 - "watch" to refresh every 10 seconds until Ctrl+C.
show_cluster_health() {
  local watch_mode=$1

  while true; do
    clear
    log_header "🏥 Cluster Health Summary"
    log_header "========================="
    echo "Last Update: $(date)"
    echo ""

    # Service status summary (total_nodes counts reachable nodes only)
    local total_nodes=0
    local active_nodes=0
    local enabled_nodes=0
    local i node ip addr status enabled

    for i in "${!NODES[@]}"; do
      node="${NODES[$i]}"
      ip="${NODE_IPS[$i]}"
      addr=$(node_address "$ip")

      if check_node_connectivity "$node" "$addr"; then
        total_nodes=$((total_nodes + 1))

        status=$(ssh -o ConnectTimeout=5 "root@$addr" \
          "systemctl is-active autosmart 2>/dev/null" 2> /dev/null)
        enabled=$(ssh -o ConnectTimeout=5 "root@$addr" \
          "systemctl is-enabled autosmart 2>/dev/null" 2> /dev/null)

        if [[ "$status" == "active" ]]; then
          active_nodes=$((active_nodes + 1))
        fi

        if [[ "$enabled" == "enabled" ]]; then
          enabled_nodes=$((enabled_nodes + 1))
        fi
      fi
    done

    echo "📡 Cluster Status:"
    echo " • Total Nodes: $total_nodes/${#NODES[@]}"
    echo " • Active Services: $active_nodes/$total_nodes"
    echo " • Enabled Services: $enabled_nodes/$total_nodes"
    echo ""

    # Quick database check
    if command -v psql &> /dev/null; then
      if run_psql -c "SELECT 1;" > /dev/null 2>&1; then
        local db_stats drives nodes readings latest
        # COALESCE keeps the whole concatenation from turning NULL (and every
        # field blank) when there are no readings yet.
        db_stats=$(run_psql -t -c "
          SELECT
            COUNT(DISTINCT serial_number) || '|' ||
            COUNT(DISTINCT current_node_id) || '|' ||
            COUNT(*) || '|' ||
            COALESCE(MAX(timestamp)::text, '')
          FROM hdd_inventory hi
          LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id;
        " 2> /dev/null | xargs)

        IFS='|' read -r drives nodes readings latest <<< "$db_stats"

        echo "🗄️ Database Status:"
        echo " • Connection: OK"
        echo " • Drives Tracked: $drives"
        echo " • Active Nodes: $nodes"
        echo " • Total Readings: $readings"
        echo " • Latest Reading: $(cut -d'.' -f1 <<< "$latest")"
      else
        echo "🗄️ Database Status: ❌ CONNECTION FAILED"
      fi
    fi

    if [[ "$watch_mode" != "watch" ]]; then
      break
    fi

    echo ""
    echo "Press Ctrl+C to exit watch mode..."
    sleep 10
  done
}

# Deploy autoSMART: run deploy-production.sh from this script's directory when
# present, otherwise copy the parent directory to each node and run its
# scripts/deploy.sh there.
deploy_cluster() {
  log_header "🚀 Cluster Deployment"
  log_header "==================="

  local script_dir
  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  local deploy_script="$script_dir/deploy-production.sh"

  if [[ -f "$deploy_script" ]]; then
    log_info "Running cluster deployment script..."
    bash "$deploy_script"
    return
  fi

  log_error "Deployment script not found: $deploy_script"
  log_info "Deploying manually to each node..."

  local i node ip addr quoted_node
  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"
    addr=$(node_address "$ip")

    echo ""
    log_info "Deploying to $node ($addr)..."

    if ! check_node_connectivity "$node" "$addr"; then
      log_error "$node: Node unreachable"
      continue
    fi

    # The node name ends up in a remote command line; quote it for the shell.
    printf -v quoted_node '%q' "$node"

    # scp of several sources needs an existing target directory, and the
    # install must not run against a partial or missing copy.
    if ssh "root@$addr" "mkdir -p /tmp/autosmart-deploy" 2> /dev/null \
      && scp -r "$(dirname "$script_dir")"/* "root@$addr:/tmp/autosmart-deploy/" 2> /dev/null \
      && ssh "root@$addr" "cd /tmp/autosmart-deploy/scripts && bash deploy.sh install --force-reinstall --node-id $quoted_node" 2> /dev/null; then
      log_success "$node: Deployment successful"
    else
      log_error "$node: Deployment failed"
    fi
  done
}

# Ask the collector daemon on each node to run a collection immediately.
# Arguments: $1 - optional node name filter (all nodes when empty)
force_collection() {
  local target_node=${1:-}

  log_header "🔄 Force SMART Collection"
  log_header "========================="

  local i node ip addr
  for i in "${!NODES[@]}"; do
    node="${NODES[$i]}"
    ip="${NODE_IPS[$i]}"

    # Skip if specific node requested and this isn't it
    node_selected "$node" "$target_node" || continue
    addr=$(node_address "$ip")

    echo ""
    log_info "Node: $node - Triggering SMART collection..."

    if check_node_connectivity "$node" "$addr"; then
      # Send SIGHUP to daemon to trigger immediate collection. The [s] bracket
      # keeps "pkill -f" from matching the remote shell running this command,
      # whose own command line contains the pattern text.
      if ssh "root@$addr" "pkill -HUP -f '[s]mart-collector-daemon' || systemctl reload autosmart" 2> /dev/null; then
        log_success "$node: Collection triggered"
      else
        log_warning "$node: Signal sent, check service status"
      fi
    else
      log_error "$node: Node unreachable"
    fi
  done
}

# Parse command line arguments into the globals COMMAND, TARGET_NODE,
# WATCH_MODE and VERBOSE. Exits 0 after --help and 1 on invalid input.
parse_args() {
  COMMAND=""
  TARGET_NODE=""
  WATCH_MODE=false
  # shellcheck disable=SC2034 # accepted for compatibility; not consumed yet
  VERBOSE=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      status | logs | start | stop | restart | deploy | database | health | collect)
        COMMAND="$1"
        shift
        ;;
      --node)
        # "shift 2" with a single remaining argument fails without shifting,
        # which would spin this loop forever.
        if [[ $# -lt 2 ]]; then
          log_error "Option --node requires a node name"
          show_usage
          exit 1
        fi
        TARGET_NODE="$2"
        shift 2
        ;;
      --watch)
        WATCH_MODE=true
        shift
        ;;
      --verbose)
        VERBOSE=true
        shift
        ;;
      --help)
        show_usage
        exit 0
        ;;
      ebony | ivory | obsidian)
        # Legacy node names: allowed as direct arguments for logs command
        if [[ "$COMMAND" == "logs" ]]; then
          TARGET_NODE="$1"
        fi
        shift
        ;;
      *)
        if [[ -z "$COMMAND" ]]; then
          COMMAND="$1"
        elif [[ "$COMMAND" == "logs" ]] && is_known_node "$1"; then
          # Any node from cluster.json may follow "logs" directly
          TARGET_NODE="$1"
        else
          log_error "Unknown option: $1"
          show_usage
          exit 1
        fi
        shift
        ;;
    esac
  done
}

# Entry point: load the node list, parse arguments and dispatch the command.
main() {
  load_cluster_nodes
  parse_args "$@"

  # Default command
  if [[ -z "$COMMAND" ]]; then
    COMMAND="status"
  fi

  case "$COMMAND" in
    status | logs | start | stop | restart | health | collect | deploy)
      check_node_selection
      ;;
  esac

  # Execute command
  case "$COMMAND" in
    status)
      if [[ "$WATCH_MODE" == true ]]; then
        while true; do
          clear
          show_service_status "$TARGET_NODE"
          echo ""
          echo "Press Ctrl+C to exit watch mode..."
          sleep 10
        done
      else
        show_service_status "$TARGET_NODE"
      fi
      ;;
    logs)
      show_logs "$TARGET_NODE"
      ;;
    start | stop | restart)
      control_services "$COMMAND" "$TARGET_NODE"
      ;;
    database)
      show_database_stats
      ;;
    health)
      if [[ "$WATCH_MODE" == true ]]; then
        show_cluster_health "watch"
      else
        show_cluster_health ""
      fi
      ;;
    deploy)
      deploy_cluster
      ;;
    collect)
      force_collection "$TARGET_NODE"
      ;;
    *)
      log_error "Unknown command: $COMMAND"
      show_usage
      exit 1
      ;;
  esac

  return 0
}

main "$@"