f16725e 3 months ago History
1 contributor
521 lines | 16.604kb
#!/bin/bash

# autoSMART Cluster Monitor
# Version: 1.0
# Description: Monitor autoSMART services across Proxmox cluster

# Configuration
#
# The node inventory is read from cluster.json, which is expected to look like
#   {"cluster": {"nodes": [{"hostname": "...", "ip": "..."}, ...]}}
# NODES[i] and NODE_IPS[i] always describe the same node.
CLUSTER_JSON="$(dirname "$0")/../cluster.json"
NODES=()
NODE_IPS=()
if [[ -f "$CLUSTER_JSON" ]] && command -v jq &> /dev/null; then
    # One jq pass per field instead of two jq processes for every node.
    mapfile -t NODES < <(jq -r '.cluster.nodes[].hostname' "$CLUSTER_JSON")
    mapfile -t NODE_IPS < <(jq -r '.cluster.nodes[].ip' "$CLUSTER_JSON")
fi
if (( ${#NODES[@]} == 0 )); then
    # Without this hint every per-node command would silently do nothing.
    echo "[WARNING] No nodes loaded from $CLUSTER_JSON (file missing, jq not installed, or no nodes defined)" >&2
fi

# Database settings. Each value can be overridden through the matching
# AUTOSMART_DB_* environment variable; the defaults keep the previous behavior.
DB_HOST="${AUTOSMART_DB_HOST:-192.168.2.102}"
DB_USER="${AUTOSMART_DB_USER:-autosmart}"
# NOTE(review): the fallback password is stored in plain text in this script.
# Prefer exporting AUTOSMART_DB_PASS and removing the default.
DB_PASS="${AUTOSMART_DB_PASS:-autoSMART2025!}"
DB_NAME="${AUTOSMART_DB_NAME:-autosmart}"

# Colors for output (escape sequences are interpreted by the log_* helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Print an informational message to stdout, prefixed with a [INFO] tag
# wrapped in the BLUE/NC color sequences.
# Arguments: $1 - message text (backslash escapes are interpreted)
log_info() {
    local message=$1
    printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}

# Print a success message to stdout, prefixed with a [SUCCESS] tag
# wrapped in the GREEN/NC color sequences.
# Arguments: $1 - message text (backslash escapes are interpreted)
log_success() {
    local message=$1
    printf '%b\n' "${GREEN}[SUCCESS]${NC} ${message}"
}

# Print a warning message to stdout, prefixed with a [WARNING] tag
# wrapped in the YELLOW/NC color sequences.
# Arguments: $1 - message text (backslash escapes are interpreted)
log_warning() {
    local message=$1
    printf '%b\n' "${YELLOW}[WARNING]${NC} ${message}"
}

# Print an error message to stdout, prefixed with an [ERROR] tag
# wrapped in the RED/NC color sequences.
# Arguments: $1 - message text (backslash escapes are interpreted)
log_error() {
    local message=$1
    printf '%b\n' "${RED}[ERROR]${NC} ${message}"
}

# Print a heading line to stdout, wrapped in the CYAN/NC color sequences.
# Arguments: $1 - heading text (backslash escapes are interpreted)
log_header() {
    local title=$1
    printf '%b\n' "${CYAN}${title}${NC}"
}

# Print the help text (commands, options and examples) to stdout.
# The script's own invocation name ($0) is substituted into the usage and
# example lines.
show_usage() {
    local prog=$0

    printf '%s\n' \
        "autoSMART Cluster Monitor v1.0" \
        "" \
        "Usage: ${prog} [COMMAND] [OPTIONS]" \
        "" \
        "Commands:" \
        "  status                Show service status on all nodes" \
        "  logs [NODE]          Show recent logs (all nodes or specific node)" \
        "  start                Start services on all nodes" \
        "  stop                 Stop services on all nodes" \
        "  restart              Restart services on all nodes" \
        "  deploy               Deploy autoSMART to all nodes" \
        "  database             Show database statistics" \
        "  health               Show cluster health summary" \
        "  collect              Force immediate SMART collection on all nodes" \
        "" \
        "Options:" \
        "  --node NODE          Target specific node (name from cluster.json)" \
        "  --watch              Continuous monitoring (refresh every 10s)" \
        "  --verbose            Show detailed output" \
        "" \
        "Examples:" \
        "  ${prog} status                           # Show status on all nodes" \
        "  ${prog} status --node <node>            # Show status on node from cluster.json" \
        "  ${prog} logs <node>                     # Show logs from node in cluster.json" \
        "  ${prog} health --watch                  # Continuous health monitoring" \
        "  ${prog} deploy                          # Deploy to all nodes" \
        ""
}

# Probe a node with a single ping (`-c 1`, `-W 2`), discarding all ping output.
# Arguments: $1 - node name (accepted for symmetry with callers; not used)
#            $2 - address handed to ping
# Returns:   0 when ping succeeds, 1 for any ping failure
check_node_connectivity() {
    local address=$2

    ping -c 1 -W 2 "$address" >/dev/null 2>&1 && return 0
    return 1
}

# Show the autosmart service state of every node, or only of target_node.
#
# For each selected node this prints whether the unit is active, whether it is
# enabled, an excerpt of the node's uptime output and the leading fields of the
# newest journal line of the unit.
#
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - node name to restrict the report to (empty = all nodes)
# Outputs:   human-readable report on stdout
show_service_status() {
    local target_node=$1
    local i node ip addr status_output active enabled up last_log

    # Script executed on the node. `systemctl is-active` / `is-enabled` already
    # print the state when they exit non-zero, so their output is captured and
    # a fallback is printed only when it is empty. Appending `|| echo ...`
    # instead would emit the state twice and shift the lines parsed below.
    # The reply is therefore always: active state, enabled state, uptime.
    local remote_status='
        svc_active=$(systemctl is-active autosmart 2>/dev/null)
        echo "${svc_active:-inactive}"
        svc_enabled=$(systemctl is-enabled autosmart 2>/dev/null)
        echo "${svc_enabled:-disabled}"
        uptime | awk "{print \$3, \$4}" | sed "s/,//"
    '

    log_header "๐Ÿ” autoSMART Service Status"
    log_header "============================="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # Skip if specific node requested and this isn't it
        if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
            continue
        fi

        # NODE_IP_BASE is optional: when it is unset the address from
        # cluster.json is used as-is instead of turning into ".<ip>".
        addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node ($addr)"
        echo "----------------------------------------"

        if ! check_node_connectivity "$node" "$addr"; then
            log_error "NETWORK UNREACHABLE"
            continue
        fi

        if ! status_output=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
                "root@$addr" "$remote_status" 2>/dev/null); then
            log_error "SSH CONNECTION FAILED"
            continue
        fi

        # One value per line, in the order produced by remote_status
        {
            IFS= read -r active
            IFS= read -r enabled
            IFS= read -r up
        } <<< "$status_output"

        echo -n "  Status: "
        if [[ "$active" == "active" ]]; then
            log_success "RUNNING"
        else
            log_error "NOT RUNNING"
        fi

        echo -n "  Enabled: "
        if [[ "$enabled" == "enabled" ]]; then
            log_success "YES"
        else
            log_warning "NO"
        fi

        echo "  Uptime: $up"

        # Get recent activity
        last_log=$(ssh -o ConnectTimeout=5 "root@$addr" \
            "journalctl -u autosmart --no-pager -n 1 --output=short-iso 2>/dev/null | tail -1" 2>/dev/null)
        if [[ -n "$last_log" ]]; then
            echo "  Last Activity: $(echo "$last_log" | awk '{print $1, $2}')"
        fi
    done
}

# Print the most recent journal entries of the autosmart unit for every node,
# or only for target_node.
#
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - node name to restrict the output to (empty = all nodes)
#            $2 - number of journal lines per node (default 20)
# Outputs:   journal lines (or 'No logs available') on stdout
show_logs() {
    local target_node=$1
    local lines=${2:-20}
    local i node ip addr

    # The count is pasted into the remote command line, so accept digits only
    # and fall back to the default for anything else.
    [[ "$lines" =~ ^[0-9]+$ ]] || lines=20

    log_header "๐Ÿ“‹ Recent Logs"
    log_header "==============="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # Skip if specific node requested and this isn't it
        if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
            continue
        fi

        # NODE_IP_BASE is optional: when it is unset the address from
        # cluster.json is used as-is instead of turning into ".<ip>".
        addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node ($addr)"
        echo "----------------------------------------"

        if check_node_connectivity "$node" "$addr"; then
            ssh -o ConnectTimeout=5 "root@$addr" \
                "journalctl -u autosmart --no-pager -n $lines --output=short-iso 2>/dev/null || echo 'No logs available'" 2>/dev/null
        else
            log_error "Node unreachable"
        fi
    done
}

# Run `systemctl <action> autosmart` on every node, or only on target_node.
#
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - systemctl verb; the caller passes start, stop or restart
#            $2 - node name to restrict the action to (empty = all nodes)
# Outputs:   one progress line and one result line per node on stdout
control_services() {
    local action=$1
    local target_node=$2
    local i node ip addr gerund past

    # "stop" doubles its final consonant; appending "ing"/"ed" blindly would
    # print "stoping" / "stoped".
    case "$action" in
        stop)
            gerund="stopping"
            past="stopped"
            ;;
        *)
            gerund="${action}ing"
            past="${action}ed"
            ;;
    esac

    log_header "๐Ÿ”ง ${action^} Services"
    log_header "==================="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # Skip if specific node requested and this isn't it
        if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
            continue
        fi

        # NODE_IP_BASE is optional: when it is unset the address from
        # cluster.json is used as-is instead of turning into ".<ip>".
        addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node - ${gerund} autosmart service..."

        if check_node_connectivity "$node" "$addr"; then
            if ssh -o ConnectTimeout=5 "root@$addr" "systemctl $action autosmart" 2>/dev/null; then
                log_success "$node: Service ${past} successfully"
            else
                log_error "$node: Failed to $action service"
            fi
        else
            log_error "$node: Node unreachable"
        fi
    done
}

# Print summary statistics from the autoSMART PostgreSQL database.
#
# After a connectivity probe (SELECT 1) two reports are queried: overall
# counters (drives, nodes seen within the last hour, readings, latest reading)
# and a per-drive comparison of total readings against distinct parameter sets.
#
# Globals: DB_HOST, DB_USER, DB_PASS, DB_NAME (read)
# Outputs: psql result tables and status messages on stdout
show_database_stats() {
    log_header "๐Ÿ“Š Database Statistics"
    log_header "====================="

    # Nothing below can run without the psql client
    if ! command -v psql &> /dev/null; then
        log_warning "psql not installed. Cannot check database statistics."
        return
    fi

    # Connection arguments shared by every psql call in this function
    local -a db_client=(psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME")

    echo ""
    log_info "Connection: $DB_HOST:5432/$DB_NAME"
    echo ""

    # Test connection
    if ! PGPASSWORD="$DB_PASS" "${db_client[@]}" -c "SELECT 1;" >/dev/null 2>&1; then
        log_error "Database connection failed"
        log_info "Please check:"
        log_info "  โ€ข PostgreSQL server is running on $DB_HOST"
        log_info "  โ€ข Database '$DB_NAME' exists"
        log_info "  โ€ข User '$DB_USER' has proper permissions"
        return
    fi

    log_success "Database connection: OK"
    echo ""

    # Get statistics
    PGPASSWORD="$DB_PASS" "${db_client[@]}" -c "
            SELECT 
                'Total Drives' as metric, COUNT(DISTINCT serial_number)::text as value
            FROM hdd_inventory
            UNION ALL
            SELECT 
                'Active Nodes', COUNT(DISTINCT current_node_id)::text
            FROM hdd_inventory WHERE last_seen > NOW() - INTERVAL '1 hour'
            UNION ALL
            SELECT 
                'Total Readings', COUNT(*)::text
            FROM smart_readings
            UNION ALL
            SELECT 
                'Readings Today', COUNT(*)::text
            FROM smart_readings WHERE timestamp > CURRENT_DATE
            UNION ALL
            SELECT 
                'Latest Reading', MAX(timestamp)::text
            FROM smart_readings;
            " 2>/dev/null

    echo ""
    log_info "Storage Efficiency:"
    PGPASSWORD="$DB_PASS" "${db_client[@]}" -c "
            SELECT 
                hi.serial_number,
                hi.model_name,
                COUNT(sr.id) as readings,
                COUNT(DISTINCT sr.parameters_json) as unique_sets,
                CASE 
                    WHEN COUNT(DISTINCT sr.parameters_json) > 0 
                    THEN ROUND((1 - COUNT(DISTINCT sr.parameters_json)::decimal / COUNT(sr.id)) * 100, 1)
                    ELSE 0 
                END as savings_percent
            FROM hdd_inventory hi
            LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id
            GROUP BY hi.id, hi.serial_number, hi.model_name
            HAVING COUNT(sr.id) > 0
            ORDER BY readings DESC;
            " 2>/dev/null
}

# Print a one-screen health summary: reachable nodes, how many run and enable
# the autosmart unit, and headline figures from the database.
#
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix);
#            DB_HOST, DB_USER, DB_PASS, DB_NAME (read)
# Arguments: $1 - "watch" to redraw every 10 seconds until interrupted;
#                 any other value prints the summary once
# Outputs:   clears the terminal, then writes the summary to stdout
show_cluster_health() {
    local watch_mode=$1
    local i node ip addr status enabled
    local total_nodes active_nodes enabled_nodes
    # Kept local so the parsed database fields do not leak into the script
    local db_stats drives nodes readings latest

    while true; do
        clear
        log_header "๐Ÿฅ Cluster Health Summary"
        log_header "========================="
        echo "Last Update: $(date)"
        echo ""

        # Service status summary
        total_nodes=0
        active_nodes=0
        enabled_nodes=0

        for i in "${!NODES[@]}"; do
            node="${NODES[$i]}"
            ip="${NODE_IPS[$i]}"

            # NODE_IP_BASE is optional: when it is unset the address from
            # cluster.json is used as-is instead of turning into ".<ip>".
            addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

            if check_node_connectivity "$node" "$addr"; then
                # Only reachable nodes count towards the totals below
                total_nodes=$((total_nodes + 1))

                status=$(ssh -o ConnectTimeout=5 "root@$addr" \
                    "systemctl is-active autosmart 2>/dev/null" 2>/dev/null)
                enabled=$(ssh -o ConnectTimeout=5 "root@$addr" \
                    "systemctl is-enabled autosmart 2>/dev/null" 2>/dev/null)

                if [[ "$status" == "active" ]]; then
                    active_nodes=$((active_nodes + 1))
                fi

                if [[ "$enabled" == "enabled" ]]; then
                    enabled_nodes=$((enabled_nodes + 1))
                fi
            fi
        done

        echo "๐Ÿ“ก Cluster Status:"
        echo "  โ€ข Total Nodes: $total_nodes/${#NODES[@]}"
        echo "  โ€ข Active Services: $active_nodes/$total_nodes"
        echo "  โ€ข Enabled Services: $enabled_nodes/$total_nodes"
        echo ""

        # Quick database check
        if command -v psql &> /dev/null; then
            if PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -c "SELECT 1;" >/dev/null 2>&1; then
                # COUNT(sr.id) counts readings only: with the LEFT JOIN, COUNT(*)
                # would also count drives that have no reading yet.
                # COALESCE keeps the row intact when there are no readings,
                # because concatenating NULL would null the whole result.
                db_stats=$(PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -t -c "
                SELECT
                    COUNT(DISTINCT serial_number) || '|' ||
                    COUNT(DISTINCT current_node_id) || '|' ||
                    COUNT(sr.id) || '|' ||
                    COALESCE(MAX(timestamp)::text, 'n/a')
                FROM hdd_inventory hi
                LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id;
                " 2>/dev/null | xargs)

                IFS='|' read -r drives nodes readings latest <<< "$db_stats"

                echo "๐Ÿ—„๏ธ  Database Status:"
                echo "  โ€ข Connection: OK"
                echo "  โ€ข Drives Tracked: $drives"
                echo "  โ€ข Active Nodes: $nodes"
                echo "  โ€ข Total Readings: $readings"
                # Drop fractional seconds from the timestamp
                echo "  โ€ข Latest Reading: $(echo "$latest" | cut -d'.' -f1)"
            else
                echo "๐Ÿ—„๏ธ  Database Status: โŒ CONNECTION FAILED"
            fi
        fi

        if [[ "$watch_mode" != "watch" ]]; then
            break
        fi

        echo ""
        echo "Press Ctrl+C to exit watch mode..."
        sleep 10
    done
}

# Deploy autoSMART to the cluster.
#
# When deploy-production.sh exists next to this script it is executed and
# nothing else happens. Otherwise the parent directory of this script is
# copied to /tmp/autosmart-deploy on every reachable node and the node's own
# scripts/deploy.sh is run there.
#
# Globals: NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Outputs: progress and per-node result messages on stdout
deploy_cluster() {
    log_header "๐Ÿš€ Cluster Deployment"
    log_header "==================="

    local script_dir deploy_script
    script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    deploy_script="$script_dir/deploy-production.sh"

    if [[ -f "$deploy_script" ]]; then
        log_info "Running cluster deployment script..."
        bash "$deploy_script"
        return
    fi

    log_error "Deployment script not found: $deploy_script"
    log_info "Deploying manually to each node..."

    local remote_dir="/tmp/autosmart-deploy"
    local source_root i node ip addr node_arg
    source_root="$(dirname "$script_dir")"

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # NODE_IP_BASE is optional: when it is unset the address from
        # cluster.json is used as-is instead of turning into ".<ip>".
        addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Deploying to $node ($addr)..."

        if ! check_node_connectivity "$node" "$addr"; then
            log_error "$node: Node unreachable"
            continue
        fi

        # The node name is pasted into a remote command line; quote it for the
        # remote shell.
        printf -v node_arg '%q' "$node"

        # Each step runs only if the previous one succeeded:
        #   1. create the target directory (a multi-file scp needs it to exist)
        #   2. copy the autoSMART files
        #   3. run the installation
        # so a failed copy is reported instead of installing stale files.
        if ssh "root@$addr" "mkdir -p $remote_dir" 2>/dev/null \
            && scp -r "$source_root"/* "root@$addr:$remote_dir/" 2>/dev/null \
            && ssh "root@$addr" "cd $remote_dir/scripts && bash deploy.sh install --force-reinstall --node-id $node_arg" 2>/dev/null; then
            log_success "$node: Deployment successful"
        else
            log_error "$node: Deployment failed"
        fi
    done
}

# Ask the collector on every node to run a SMART collection right away.
#
# The collector process is sent SIGHUP; if no matching process is found the
# autosmart unit is reloaded instead.
#
# Globals: NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Outputs: one progress line and one result line per node on stdout
force_collection() {
    local i node ip addr

    log_header "๐Ÿ”„ Force SMART Collection"
    log_header "========================="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # NODE_IP_BASE is optional: when it is unset the address from
        # cluster.json is used as-is instead of turning into ".<ip>".
        addr="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node - Triggering SMART collection..."

        if check_node_connectivity "$node" "$addr"; then
            # Send SIGHUP to daemon to trigger immediate collection.
            # `pkill -f` matches full command lines, and the remote shell that
            # runs this very command contains the daemon name too. Writing the
            # pattern as [s]mart-... still matches the daemon but no longer
            # matches the shell's own command line, so the shell is not HUPed.
            if ssh "root@$addr" "pkill -HUP -f '[s]mart-collector-daemon' || systemctl reload autosmart" 2>/dev/null; then
                log_success "$node: Collection triggered"
            else
                log_warning "$node: Signal sent, check service status"
            fi
        else
            log_error "$node: Node unreachable"
        fi
    done
}

# Parse command line arguments
COMMAND=""
TARGET_NODE=""
WATCH_MODE=false
VERBOSE=false

# Return 0 when $1 is one of the node names loaded into NODES.
_is_known_node() {
    local candidate=$1
    local known

    for known in "${NODES[@]}"; do
        if [[ "$known" == "$candidate" ]]; then
            return 0
        fi
    done
    return 1
}

# Fill COMMAND, TARGET_NODE, WATCH_MODE and VERBOSE from the given arguments.
#
# Globals:   COMMAND, TARGET_NODE, WATCH_MODE, VERBOSE (written); NODES (read)
# Arguments: the script's command line ("$@")
# Exits:     0 after printing help for -h/--help; 1 on an unknown option or a
#            --node without a value
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            status|logs|start|stop|restart|deploy|database|health|collect)
                COMMAND="$1"
                shift
                ;;
            --node)
                # With only "--node" left, `shift 2` fails without shifting
                # anything and the loop would never terminate.
                if [[ $# -lt 2 ]]; then
                    log_error "Option --node requires a node name"
                    show_usage
                    exit 1
                fi
                TARGET_NODE="$2"
                shift 2
                ;;
            --watch)
                WATCH_MODE=true
                shift
                ;;
            --verbose)
                VERBOSE=true
                shift
                ;;
            -h|--help)
                show_usage
                exit 0
                ;;
            ebony|ivory|obsidian)
                # Legacy node names: accepted as a direct argument for the logs
                # command and silently ignored for every other command.
                if [[ "$COMMAND" == "logs" ]]; then
                    TARGET_NODE="$1"
                fi
                shift
                ;;
            *)
                if [[ -z "$COMMAND" ]]; then
                    COMMAND="$1"
                elif [[ "$COMMAND" == "logs" ]] && _is_known_node "$1"; then
                    # "logs <node>" works for any node listed in cluster.json
                    TARGET_NODE="$1"
                else
                    log_error "Unknown option: $1"
                    show_usage
                    exit 1
                fi
                shift
                ;;
        esac
    done
}

parse_args "$@"

# Default command: fall back to "status" when none was given
: "${COMMAND:=status}"

# Execute command: hand over to the function that implements COMMAND
if [[ "$COMMAND" == "status" ]]; then
    if [[ "$WATCH_MODE" != true ]]; then
        show_service_status "$TARGET_NODE"
    else
        # Redraw the status report every 10 seconds until interrupted
        while :; do
            clear
            show_service_status "$TARGET_NODE"
            echo ""
            echo "Press Ctrl+C to exit watch mode..."
            sleep 10
        done
    fi
elif [[ "$COMMAND" == "logs" ]]; then
    show_logs "$TARGET_NODE"
elif [[ "$COMMAND" == "start" || "$COMMAND" == "stop" || "$COMMAND" == "restart" ]]; then
    control_services "$COMMAND" "$TARGET_NODE"
elif [[ "$COMMAND" == "database" ]]; then
    show_database_stats
elif [[ "$COMMAND" == "health" ]]; then
    # The health view does its own refresh loop when asked to "watch"
    if [[ "$WATCH_MODE" == true ]]; then
        show_cluster_health "watch"
    else
        show_cluster_health ""
    fi
elif [[ "$COMMAND" == "deploy" ]]; then
    deploy_cluster
elif [[ "$COMMAND" == "collect" ]]; then
    force_collection
else
    log_error "Unknown command: $COMMAND"
    show_usage
    exit 1
fi

exit 0