Newer Older
f16725e 3 months ago History
521 lines | 16.604kb
Bogdan Timofte authored 3 months ago
1
#!/bin/bash
2

            
3
# autoSMART Cluster Monitor
4
# Version: 1.0
5
# Description: Monitor autoSMART services across Proxmox cluster
6

            
7
# Configuration
8
CLUSTER_JSON="$(dirname "$0")/../cluster.json"
NODES=()
NODE_IPS=()

#######################################
# Fill NODES / NODE_IPS from the cluster description file.
# Globals:   CLUSTER_JSON (read), NODES and NODE_IPS (reset, then written)
# Outputs:   a warning on stderr when the node list cannot be loaded
# Returns:   1 when the file or jq is missing, otherwise the loop status
#######################################
load_cluster_nodes() {
    local hostname ip

    NODES=()
    NODE_IPS=()

    if [[ ! -f "$CLUSTER_JSON" ]]; then
        printf 'WARNING: cluster file not found: %s (node list is empty)\n' "$CLUSTER_JSON" >&2
        return 1
    fi
    if ! command -v jq &> /dev/null; then
        printf 'WARNING: jq is not installed; cannot read %s (node list is empty)\n' "$CLUSTER_JSON" >&2
        return 1
    fi

    # A single jq pass emits "hostname<TAB>ip" per node, instead of starting
    # two jq processes for every node.
    while IFS=$'\t' read -r hostname ip; do
        NODES+=("$hostname")
        NODE_IPS+=("$ip")
    done < <(jq -r '.cluster.nodes[] | "\(.hostname)\t\(.ip)"' "$CLUSTER_JSON")
}
load_cluster_nodes

# Database connection settings. Each value can be overridden through the
# matching AUTOSMART_DB_* environment variable; the literals are the previous
# hard-coded defaults.
# NOTE(review): the default password is still embedded here for backward
# compatibility - consider requiring AUTOSMART_DB_PASS instead.
DB_HOST="${AUTOSMART_DB_HOST:-192.168.2.102}"
DB_USER="${AUTOSMART_DB_USER:-autosmart}"
DB_PASS="${AUTOSMART_DB_PASS:-autoSMART2025!}"
DB_NAME="${AUTOSMART_DB_NAME:-autosmart}"
21

            
22
# Colors for output
23
# Escape sequences are stored as literal text; the log_* helpers expand them.
NC='\033[0m'          # reset / no colour
RED='\033[0;31m'      # errors
GREEN='\033[0;32m'    # success
YELLOW='\033[1;33m'   # warnings
BLUE='\033[0;34m'     # informational
CYAN='\033[0;36m'     # headers
29

            
30
# Print an informational message ($1) prefixed with a blue [INFO] tag.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
33

            
34
# Print a success message ($1) prefixed with a green [SUCCESS] tag.
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
37

            
38
# Print a warning message ($1) prefixed with a yellow [WARNING] tag.
log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
41

            
42
# Print an error message ($1) prefixed with a red [ERROR] tag (on stdout).
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
45

            
46
# Print a section heading ($1) entirely in cyan.
log_header() {
    printf '%b\n' "${CYAN}$1${NC}"
}
49

            
50
# Print the command-line help text to stdout, one list entry per output line.
show_usage() {
    local -a help_lines=(
        "autoSMART Cluster Monitor v1.0"
        ""
        "Usage: $0 [COMMAND] [OPTIONS]"
        ""
        "Commands:"
        "  status                Show service status on all nodes"
        "  logs [NODE]          Show recent logs (all nodes or specific node)"
        "  start                Start services on all nodes"
        "  stop                 Stop services on all nodes"
        "  restart              Restart services on all nodes"
        "  deploy               Deploy autoSMART to all nodes"
        "  database             Show database statistics"
        "  health               Show cluster health summary"
        "  collect              Force immediate SMART collection on all nodes"
        ""
        "Options:"
        "  --node NODE          Target specific node (name from cluster.json)"
        "  --watch              Continuous monitoring (refresh every 10s)"
        "  --verbose            Show detailed output"
        ""
        "Examples:"
        "  $0 status                           # Show status on all nodes"
        "  $0 status --node <node>            # Show status on node from cluster.json"
        "  $0 logs <node>                     # Show logs from node in cluster.json"
        "  $0 health --watch                  # Continuous health monitoring"
        "  $0 deploy                          # Deploy to all nodes"
        ""
    )

    printf '%s\n' "${help_lines[@]}"
}
79

            
80
# Probe a node with one ICMP echo (2 second wait).
# Arguments: $1 - node name (not used by the probe itself)
#            $2 - address handed to ping
# Returns:   0 when ping succeeds, 1 for any ping failure
check_node_connectivity() {
    local node=$1 ip=$2

    ping -c 1 -W 2 "$ip" >/dev/null 2>&1 || return 1
    return 0
}
90

            
91
#######################################
# Print the autosmart service state for every node, or for one node.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - node name to report on; empty means all nodes
# Outputs:   human-readable report on stdout
#######################################
show_service_status() {
    local target_node=$1
    local i node ip host status_output active enabled uptime last_log
    local -a status_lines

    # Runs on the node. Each state is printed through its own command
    # substitution so the reply always has exactly three lines: systemctl
    # is-active / is-enabled already print the state on failure, so appending
    # "|| echo ..." would emit it twice and shift the following lines.
    local remote_status_cmd='printf "%s\n" "$(systemctl is-active autosmart 2>/dev/null | head -n 1)"; printf "%s\n" "$(systemctl is-enabled autosmart 2>/dev/null | head -n 1)"; uptime | awk "{print \$3, \$4}" | sed "s/,//"'

    log_header "🔍 autoSMART Service Status"
    log_header "============================="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # Skip if specific node requested and this isn't it
        if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
            continue
        fi

        # NODE_IP_BASE is not set anywhere in this script. When it is unset the
        # address from cluster.json is used as-is (previously "root@.<ip>");
        # when it is provided it is still prepended as "<base>.<ip>".
        host="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node ($host)"
        echo "----------------------------------------"

        if ! check_node_connectivity "$node" "$host"; then
            log_error "NETWORK UNREACHABLE"
            continue
        fi

        if ! status_output=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$host" \
            "$remote_status_cmd" 2>/dev/null); then
            log_error "SSH CONNECTION FAILED"
            continue
        fi

        # Line 1: active state, line 2: enabled state, line 3: uptime fields.
        mapfile -t status_lines <<< "$status_output"
        active="${status_lines[0]:-}"
        enabled="${status_lines[1]:-}"
        uptime="${status_lines[2]:-}"

        echo -n "  Status: "
        if [[ "$active" == "active" ]]; then
            log_success "RUNNING"
        else
            log_error "NOT RUNNING"
        fi

        echo -n "  Enabled: "
        if [[ "$enabled" == "enabled" ]]; then
            log_success "YES"
        else
            log_warning "NO"
        fi

        echo "  Uptime: $uptime"

        # Get recent activity: first two fields of the newest journal entry
        last_log=$(ssh -o ConnectTimeout=5 "root@$host" \
            "journalctl -u autosmart --no-pager -n 1 --output=short-iso 2>/dev/null | tail -1" 2>/dev/null)
        if [[ -n "$last_log" ]]; then
            echo "  Last Activity: $(echo "$last_log" | awk '{print $1, $2}')"
        fi
    done
}
153

            
154
#######################################
# Print recent autosmart journal entries from every node, or from one node.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - node name to read from; empty means all nodes
#            $2 - number of journal lines (default 20)
# Outputs:   journal text on stdout
#######################################
show_logs() {
    local target_node=$1
    local lines=${2:-20}
    local i node ip host

    # The count is spliced into the remote command line, so only digits are
    # accepted; anything else falls back to the default.
    if [[ ! "$lines" =~ ^[0-9]+$ ]]; then
        lines=20
    fi

    log_header "📋 Recent Logs"
    log_header "==============="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # Skip if specific node requested and this isn't it
        if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
            continue
        fi

        # NODE_IP_BASE is not set anywhere in this script; when unset the
        # address from cluster.json is used unchanged instead of ".<ip>".
        host="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node ($host)"
        echo "----------------------------------------"

        if check_node_connectivity "$node" "$host"; then
            ssh -o ConnectTimeout=5 "root@$host" \
                "journalctl -u autosmart --no-pager -n $lines --output=short-iso 2>/dev/null || echo 'No logs available'" 2>/dev/null
        else
            log_error "Node unreachable"
        fi
    done
}
182

            
183
#######################################
# Run "systemctl <action> autosmart" on every node, or on one node.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - systemctl action (the script passes start, stop or restart)
#            $2 - node name to act on; empty means all nodes
# Outputs:   per-node progress and result messages on stdout
#######################################
control_services() {
    local action=$1
    local target_node=$2
    local i node ip host gerund past

    # "stop" doubles its final consonant; the other verbs used by this script
    # take the plain -ing / -ed suffix.
    case "$action" in
        stop)
            gerund="stopping"
            past="stopped"
            ;;
        *)
            gerund="${action}ing"
            past="${action}ed"
            ;;
    esac

    log_header "🔧 ${action^} Services"
    log_header "==================="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # Skip if specific node requested and this isn't it
        if [[ -n "$target_node" && "$node" != "$target_node" ]]; then
            continue
        fi

        # NODE_IP_BASE is not set anywhere in this script; when unset the
        # address from cluster.json is used unchanged instead of ".<ip>".
        host="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node - $gerund autosmart service..."

        if ! check_node_connectivity "$node" "$host"; then
            log_error "$node: Node unreachable"
            continue
        fi

        if ssh -o ConnectTimeout=5 "root@$host" "systemctl $action autosmart" 2>/dev/null; then
            log_success "$node: Service $past successfully"
        else
            log_error "$node: Failed to $action service"
        fi
    done
}
213

            
214
# Report PostgreSQL statistics for the autoSMART database.
# Reads DB_HOST, DB_USER, DB_PASS and DB_NAME; needs the psql client.
# Prints a metric summary followed by a per-drive "storage efficiency" table,
# or connection troubleshooting hints when the test query fails.
show_database_stats() {
    log_header "📊 Database Statistics"
    log_header "====================="

    # Nothing can be queried without the client binary.
    if ! command -v psql &> /dev/null; then
        log_warning "psql not installed. Cannot check database statistics."
        return
    fi

    # Connection arguments shared by every query below.
    local -a psql_cmd=(psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME")

    local summary_sql="
            SELECT
                'Total Drives' as metric, COUNT(DISTINCT serial_number)::text as value
            FROM hdd_inventory
            UNION ALL
            SELECT
                'Active Nodes', COUNT(DISTINCT current_node_id)::text
            FROM hdd_inventory WHERE last_seen > NOW() - INTERVAL '1 hour'
            UNION ALL
            SELECT
                'Total Readings', COUNT(*)::text
            FROM smart_readings
            UNION ALL
            SELECT
                'Readings Today', COUNT(*)::text
            FROM smart_readings WHERE timestamp > CURRENT_DATE
            UNION ALL
            SELECT
                'Latest Reading', MAX(timestamp)::text
            FROM smart_readings;
            "

    local efficiency_sql="
            SELECT
                hi.serial_number,
                hi.model_name,
                COUNT(sr.id) as readings,
                COUNT(DISTINCT sr.parameters_json) as unique_sets,
                CASE
                    WHEN COUNT(DISTINCT sr.parameters_json) > 0
                    THEN ROUND((1 - COUNT(DISTINCT sr.parameters_json)::decimal / COUNT(sr.id)) * 100, 1)
                    ELSE 0
                END as savings_percent
            FROM hdd_inventory hi
            LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id
            GROUP BY hi.id, hi.serial_number, hi.model_name
            HAVING COUNT(sr.id) > 0
            ORDER BY readings DESC;
            "

    echo ""
    log_info "Connection: $DB_HOST:5432/$DB_NAME"
    echo ""

    # Test connection with a trivial query before running the real ones.
    if ! PGPASSWORD="$DB_PASS" "${psql_cmd[@]}" -c "SELECT 1;" >/dev/null 2>&1; then
        log_error "Database connection failed"
        log_info "Please check:"
        log_info "  • PostgreSQL server is running on $DB_HOST"
        log_info "  • Database '$DB_NAME' exists"
        log_info "  • User '$DB_USER' has proper permissions"
        return
    fi

    log_success "Database connection: OK"
    echo ""

    # Get statistics
    PGPASSWORD="$DB_PASS" "${psql_cmd[@]}" -c "$summary_sql" 2>/dev/null

    echo ""
    log_info "Storage Efficiency:"
    PGPASSWORD="$DB_PASS" "${psql_cmd[@]}" -c "$efficiency_sql" 2>/dev/null
}
282

            
283
#######################################
# Print a one-screen summary: reachable nodes, service state counts and a
# quick database check. Repeats every 10 seconds in watch mode.
# Globals:   NODES, NODE_IPS, DB_HOST, DB_USER, DB_PASS, DB_NAME (read);
#            NODE_IP_BASE (read, optional prefix)
# Arguments: $1 - "watch" to refresh until interrupted; anything else runs once
# Outputs:   summary on stdout (the screen is cleared before each pass)
#######################################
show_cluster_health() {
    local watch_mode=$1
    local i node ip host status enabled
    local total_nodes active_nodes enabled_nodes
    # Kept local so the summary fields do not leak into the global scope.
    local db_stats drives nodes readings latest

    while true; do
        clear
        log_header "🏥 Cluster Health Summary"
        log_header "========================="
        echo "Last Update: $(date)"
        echo ""

        # Service status summary (total_nodes counts reachable nodes only)
        total_nodes=0
        active_nodes=0
        enabled_nodes=0

        for i in "${!NODES[@]}"; do
            node="${NODES[$i]}"
            ip="${NODE_IPS[$i]}"

            # NODE_IP_BASE is not set anywhere in this script; when unset the
            # address from cluster.json is used unchanged instead of ".<ip>".
            host="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

            if ! check_node_connectivity "$node" "$host"; then
                continue
            fi
            total_nodes=$((total_nodes + 1))

            status=$(ssh -o ConnectTimeout=5 "root@$host" \
                "systemctl is-active autosmart 2>/dev/null" 2>/dev/null)
            enabled=$(ssh -o ConnectTimeout=5 "root@$host" \
                "systemctl is-enabled autosmart 2>/dev/null" 2>/dev/null)

            if [[ "$status" == "active" ]]; then
                active_nodes=$((active_nodes + 1))
            fi

            if [[ "$enabled" == "enabled" ]]; then
                enabled_nodes=$((enabled_nodes + 1))
            fi
        done

        echo "📡 Cluster Status:"
        echo "  • Total Nodes: $total_nodes/${#NODES[@]}"
        echo "  • Active Services: $active_nodes/$total_nodes"
        echo "  • Enabled Services: $enabled_nodes/$total_nodes"
        echo ""

        # Quick database check
        if command -v psql &> /dev/null; then
            if PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -c "SELECT 1;" >/dev/null 2>&1; then
                # One row, four values joined by '|'; xargs trims the padding
                # psql adds around tuples-only output.
                db_stats=$(PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -t -c "
                SELECT
                    COUNT(DISTINCT serial_number) || '|' ||
                    COUNT(DISTINCT current_node_id) || '|' ||
                    COUNT(*) || '|' ||
                    MAX(timestamp)
                FROM hdd_inventory hi
                LEFT JOIN smart_readings sr ON hi.id = sr.hdd_id;
                " 2>/dev/null | xargs)

                IFS='|' read -r drives nodes readings latest <<< "$db_stats"

                echo "🗄️  Database Status:"
                echo "  • Connection: OK"
                echo "  • Drives Tracked: $drives"
                echo "  • Active Nodes: $nodes"
                echo "  • Total Readings: $readings"
                echo "  • Latest Reading: $(echo "$latest" | cut -d'.' -f1)"
            else
                echo "🗄️  Database Status: ❌ CONNECTION FAILED"
            fi
        fi

        if [[ "$watch_mode" != "watch" ]]; then
            break
        fi

        echo ""
        echo "Press Ctrl+C to exit watch mode..."
        sleep 10
    done
}
361

            
362
#######################################
# Deploy autoSMART to the cluster. Runs deploy-production.sh from this
# script's directory when it exists; otherwise copies the parent directory to
# each reachable node and runs scripts/deploy.sh there.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Outputs:   per-node progress and result messages on stdout
# Returns:   status of deploy-production.sh when it is used
#######################################
deploy_cluster() {
    local script_dir deploy_script
    local i node ip host quoted_node

    log_header "🚀 Cluster Deployment"
    log_header "==================="

    script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    deploy_script="$script_dir/deploy-production.sh"

    if [[ -f "$deploy_script" ]]; then
        log_info "Running cluster deployment script..."
        bash "$deploy_script"
        return
    fi

    log_error "Deployment script not found: $deploy_script"
    log_info "Deploying manually to each node..."

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # NODE_IP_BASE is not set anywhere in this script; when unset the
        # address from cluster.json is used unchanged instead of ".<ip>".
        host="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Deploying to $node ($host)..."

        if ! check_node_connectivity "$node" "$host"; then
            log_error "$node: Node unreachable"
            continue
        fi

        # The node name is spliced into a remote command line; quote it.
        printf -v quoted_node '%q' "$node"

        # Three steps, each of which must succeed:
        #   1. create the staging directory (scp does not create it),
        #   2. copy the autoSMART files,
        #   3. run the installer on the node.
        if ssh -o ConnectTimeout=5 "root@$host" "mkdir -p /tmp/autosmart-deploy" 2>/dev/null \
            && scp -o ConnectTimeout=5 -r "$(dirname "$script_dir")"/* "root@$host:/tmp/autosmart-deploy/" 2>/dev/null \
            && ssh -o ConnectTimeout=5 "root@$host" "cd /tmp/autosmart-deploy/scripts && bash deploy.sh install --force-reinstall --node-id $quoted_node" 2>/dev/null; then
            log_success "$node: Deployment successful"
        else
            log_error "$node: Deployment failed"
        fi
    done
}
401

            
402
#######################################
# Ask the collector on every reachable node to run a SMART collection now,
# by sending SIGHUP to the daemon or, failing that, reloading the service.
# Globals:   NODES, NODE_IPS (read); NODE_IP_BASE (read, optional prefix)
# Outputs:   per-node progress and result messages on stdout
#######################################
force_collection() {
    local i node ip host

    log_header "🔄 Force SMART Collection"
    log_header "========================="

    for i in "${!NODES[@]}"; do
        node="${NODES[$i]}"
        ip="${NODE_IPS[$i]}"

        # NODE_IP_BASE is not set anywhere in this script; when unset the
        # address from cluster.json is used unchanged instead of ".<ip>".
        host="${NODE_IP_BASE:+${NODE_IP_BASE}.}${ip}"

        echo ""
        log_info "Node: $node - Triggering SMART collection..."

        if ! check_node_connectivity "$node" "$host"; then
            log_error "$node: Node unreachable"
            continue
        fi

        # Send SIGHUP to daemon to trigger immediate collection.
        # pkill -f matches whole command lines, and the remote shell's own
        # command line contains the pattern text. Writing the first letter as
        # a bracket expression keeps the regex matching the daemon while no
        # longer matching the shell that runs this command.
        if ssh -o ConnectTimeout=5 "root@$host" \
            "pkill -HUP -f '[s]mart-collector-daemon' || systemctl reload autosmart" 2>/dev/null; then
            log_success "$node: Collection triggered"
        else
            log_warning "$node: Signal sent, check service status"
        fi
    done
}
427

            
428
# Parse command line arguments
429
COMMAND=""
TARGET_NODE=""
WATCH_MODE=false
VERBOSE=false

#######################################
# Parse the command line into COMMAND, TARGET_NODE, WATCH_MODE and VERBOSE.
# Globals:   COMMAND, TARGET_NODE, WATCH_MODE, VERBOSE (written)
# Arguments: the script's positional parameters
# Outputs:   an error plus the usage text for invalid input
# Exits:     0 after --help, 1 on an unknown or incomplete option
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            status|logs|start|stop|restart|deploy|database|health|collect)
                COMMAND="$1"
                shift
                ;;
            --node)
                # Without this check "shift 2" fails when --node is the last
                # argument, nothing is consumed and the loop never ends.
                if [[ $# -lt 2 || -z "$2" ]]; then
                    log_error "Option --node requires a node name"
                    show_usage
                    exit 1
                fi
                TARGET_NODE="$2"
                shift 2
                ;;
            --watch)
                WATCH_MODE=true
                shift
                ;;
            --verbose)
                VERBOSE=true
                shift
                ;;
            -h|--help)
                show_usage
                exit 0
                ;;
            ebony|ivory|obsidian)
                # Legacy node names: taken as the logs target, and silently
                # ignored after any other command (unchanged behavior).
                if [[ "$COMMAND" == "logs" ]]; then
                    TARGET_NODE="$1"
                fi
                shift
                ;;
            *)
                if [[ -z "$COMMAND" ]]; then
                    COMMAND="$1"
                elif [[ "$COMMAND" == "logs" && -z "$TARGET_NODE" && "$1" != -* ]]; then
                    # "logs <node>" accepts any node name, since the node
                    # list comes from cluster.json rather than a fixed set.
                    TARGET_NODE="$1"
                else
                    log_error "Unknown option: $1"
                    show_usage
                    exit 1
                fi
                shift
                ;;
        esac
    done
}
parse_args "$@"
475

            
476
# Default command
477
# Fall back to "status" when no command was given (unset or empty).
: "${COMMAND:=status}"
480

            
481
# Execute command
482
# Dispatch to the handler for the selected command. --watch only affects
# "status" (loop here) and "health" (loop inside show_cluster_health).
case "$COMMAND" in
    status)
        if [[ "$WATCH_MODE" != true ]]; then
            show_service_status "$TARGET_NODE"
        else
            # Redraw the report every 10 seconds until interrupted.
            while :; do
                clear
                show_service_status "$TARGET_NODE"
                echo ""
                echo "Press Ctrl+C to exit watch mode..."
                sleep 10
            done
        fi
        ;;
    logs)
        show_logs "$TARGET_NODE"
        ;;
    start|stop|restart)
        control_services "$COMMAND" "$TARGET_NODE"
        ;;
    database)
        show_database_stats
        ;;
    health)
        # The handler expects the literal word "watch" or an empty string.
        if [[ "$WATCH_MODE" == true ]]; then
            show_cluster_health "watch"
        else
            show_cluster_health ""
        fi
        ;;
    deploy)
        deploy_cluster
        ;;
    collect)
        force_collection
        ;;
    *)
        log_error "Unknown command: $COMMAND"
        show_usage
        exit 1
        ;;
esac

exit 0