Newer Older
f16725e 3 months ago History
1352 lines | 43.254kb
Bogdan Timofte authored 3 months ago
1
#!/bin/bash
2

            
3
# pgs
4
# Manages VM and CT suspend/shutdown for planned maintenance.
5
#
6
# Before maintenance (suspend mode):
7
#   - Suspends all running VMs to disk
8
#   - Gracefully shuts down all running CTs
9
#   - Saves state to a list for restoration
10
#   - VMs already suspended to disk: logged as warning, not auto-resumed
11
#   - VMs suspended to RAM: suspended to disk but not auto-resumed (preserving user intent)
12
#
13
# After maintenance (resume mode):
14
#   - Resumes VMs from the saved list
15
#   - Starts CTs from the saved list
16
#   - Logs warnings for VMs/CTs skipped
17
#   - Logs errors for VMs/CTs that fail to resume/start
18
#
19
# Usage: pgs suspend|resume|cleanup [-n|--dry-run] [-v|--verbose] [-h|--help]
20
#
21
# Version: 1.4 - Standardized xdev state path with legacy state migration
22
#
23
# TODO: Implement critical VM/CT migration support.
24
#       Critical guests (tagged or listed) should be live-migrated to another
25
#       node before maintenance instead of suspended/stopped. Rules TBD:
26
#       - Which guests are critical (tag? config flag? external list?)
27
#       - Target node selection (least loaded? affinity rules?)
28
#       - Fallback if migration fails (suspend locally?)
29
#       - Post-maintenance: migrate back or leave on target node?
30

            
31
# Naming components used to build the default state directory.
ORG_ID="xdev"
PROJECT_ID="pve-guests-state"

# State file locations. PGS_STATE_DIR, when set and non-empty, overrides the
# default directory. The LEGACY_* values name the older state location.
DEFAULT_STATE_DIR="/var/lib/${ORG_ID}/${PROJECT_ID}"
STATE_DIR="${PGS_STATE_DIR:-${DEFAULT_STATE_DIR}}"
STATE_FILE="${STATE_DIR}/pgs-state.json"
LEGACY_STATE_DIR="/var/lib/pve-manager"
LEGACY_STATE_FILE="${LEGACY_STATE_DIR}/pgs-state.json"

LOCK_FILE="/run/pgs.lock"
SCRIPT_NAME=$(basename "$0")

# Runtime flags; all start disabled.
DRY_RUN=0
VERBOSE=0
QUORUM_RELAXED=0
44

            
45
# Per-VM lookup tables keyed by VMID, plus VMSTATE_TO_VMID which maps a
# vmstate volume back to the VMID whose config references it.
declare -A VM_STATUS VM_NAME VM_HAS_LOCK VM_VMSTATE VMSTATE_TO_VMID

# Per-CT lookup tables keyed by CTID.
declare -A CT_STATUS CT_NAME
55

            
56
# Logging functions.
57
# When running inside systemd (JOURNAL_STREAM is set), stdout goes directly to
58
# the journal - calling logger in addition causes duplicate entries. When running
59
# interactively, use both echo (terminal) and logger (journal archive).
60
# Emit one log line.
# Arguments: $1   - syslog priority handed to logger (e.g. user.info)
#            $2   - prefix printed in front of the message
#            $3.. - message words, joined with single spaces
# Outputs:   "<prefix> <message>" on stdout; the message is also sent to
#            logger when JOURNAL_STREAM is unset or empty.
# Returns:   0 when logger is not invoked, otherwise logger's exit status.
_log() {
    local level="$1" prefix="$2"
    shift 2
    printf '%s\n' "$prefix $*"
    # An explicit `if` keeps the "journal already has it" case from leaking
    # a non-zero status out of the function.
    if [[ -z "${JOURNAL_STREAM:-}" ]]; then
        logger -t "$SCRIPT_NAME" -p "$level" "$*"
    fi
}
65

            
66
# Informational message. Under systemd (JOURNAL_STREAM set) it is always
# emitted; interactively it is emitted only when VERBOSE is at least 1.
log_info() {
    if [[ -z "${JOURNAL_STREAM:-}" && $VERBOSE -lt 1 ]]; then
        return 0
    fi
    _log user.info "[INFO]" "$@"
}
73

            
74
# Debug message. Under systemd (JOURNAL_STREAM set) it is always emitted;
# interactively it is emitted only when VERBOSE is at least 2 (-vv).
log_debug() {
    if [[ -z "${JOURNAL_STREAM:-}" && $VERBOSE -lt 2 ]]; then
        return 0
    fi
    _log user.debug "[DEBUG]" "$@"
}
79

            
80
# Warning message; always emitted regardless of VERBOSE.
log_warning() {
    local -r priority="user.warning" tag="[WARNING]"
    _log "$priority" "$tag" "$@"
}
83

            
84
# Error message; always emitted.
# Outputs: "[ERROR] <message>" on stderr; the message is also sent to logger
#          (priority user.err) when JOURNAL_STREAM is unset or empty.
# Returns: 0 when logger is not invoked, otherwise logger's exit status.
log_error() {
    printf '%s\n' "[ERROR] $*" >&2
    # `if` instead of `[[ ... ]] && logger` so the function does not return 1
    # merely because the journal is already capturing output.
    if [[ -z "${JOURNAL_STREAM:-}" ]]; then
        logger -t "$SCRIPT_NAME" -p user.err "$*"
    fi
}
88

            
89
# Success message; always emitted regardless of VERBOSE.
log_success() {
    local -r priority="user.notice" tag="[SUCCESS]"
    _log "$priority" "$tag" "$@"
}
92

            
93
# Print the command-line help text to stdout. $0 is substituted as the
# program name in the synopsis and examples.
usage() {
    printf '%s\n' \
        "Usage: $0 suspend|resume|cleanup [OPTIONS]" \
        "" \
        "Manage VM and CT suspend/shutdown for planned maintenance." \
        "" \
        "Commands:" \
        "  suspend    Suspend running VMs to disk, shutdown running CTs" \
        "  resume     Resume VMs and start CTs from saved state" \
        "  cleanup    Remove stale suspend artifacts from config and storage" \
        "" \
        "Options:" \
        "  -n, --dry-run    Show what would be done without making changes" \
        "  -v, --verbose    Print informational messages (-vv adds debug detail)" \
        "  -h, --help       Display this help and exit" \
        "" \
        "Examples:" \
        "  $0 suspend              # Suspend VMs, shutdown CTs" \
        "  $0 resume               # Resume VMs, start CTs" \
        "  $0 cleanup -v           # Remove orphan/stale suspend artifacts" \
        "  $0 cleanup -vv          # Include real filesystem paths in cleanup output" \
        "  $0 suspend --dry-run    # Show what would happen"
}
117

            
118
# Rebuild the suspend-artifact caches from the VM config files.
# Globals:   VM_HAS_LOCK, VM_VMSTATE, VMSTATE_TO_VMID (reset, then written)
# Arguments: $1 - directory holding <vmid>.conf files
#                 (optional, default /etc/pve/qemu-server)
# Only the head of each file (everything before the first "[section]" line)
# is inspected, so a "vmstate:" entry stored inside a snapshot section is not
# mistaken for the VM's current suspend state.
refresh_vm_artifact_metadata() {
    local conf_dir="${1:-/etc/pve/qemu-server}"
    local conf vmid line has_lock vmstate

    VM_HAS_LOCK=()
    VM_VMSTATE=()
    VMSTATE_TO_VMID=()

    for conf in "$conf_dir"/*.conf; do
        # Also covers the unmatched-glob case, where $conf is the literal pattern.
        [[ -f "$conf" ]] || continue
        vmid="${conf##*/}"
        vmid="${vmid%.conf}"
        has_lock=0
        vmstate=""

        # Unreadable files are skipped silently, as before.
        {
            while IFS= read -r line || [[ -n "$line" ]]; do
                # First section header ends the current-config part of the file.
                [[ "$line" == "["* ]] && break
                case "$line" in
                    "lock: suspended")
                        has_lock=1
                        ;;
                    "vmstate: "*)
                        # Keep the first vmstate entry only.
                        if [[ -z "$vmstate" ]]; then
                            vmstate="${line#vmstate: }"
                        fi
                        ;;
                esac
            done < "$conf"
        } 2>/dev/null

        if [[ $has_lock -eq 1 ]]; then
            VM_HAS_LOCK[$vmid]=1
        fi
        if [[ -n "$vmstate" ]]; then
            VM_VMSTATE[$vmid]="$vmstate"
            VMSTATE_TO_VMID[$vmstate]="$vmid"
        fi
    done
}
137

            
138
# Reset the VM caches, load VM names from `qm list`, then refresh the
# suspend-artifact caches.
# Globals: VM_STATUS (reset), VM_NAME (reset, then written); further caches
#          are written by refresh_vm_artifact_metadata.
load_vm_config_metadata() {
    # Keep the read targets local so callers' vmid/name variables survive.
    local vmid name _status _rest

    VM_STATUS=()
    VM_NAME=()

    while read -r vmid name _status _rest; do
        # Skips the "VMID" header row as well as blank or malformed lines.
        [[ "$vmid" =~ ^[0-9]+$ ]] || continue
        VM_NAME[$vmid]="$name"
    done < <(qm list 2>/dev/null)

    refresh_vm_artifact_metadata
}
149

            
150
# Load VM names, statuses and suspend-artifact metadata into the caches.
# Globals: VM_STATUS, VM_NAME (written); whatever load_vm_config_metadata
#          resets and fills.
# Note: `qm list` is run here for the status column in addition to the run
# inside load_vm_config_metadata; `qm status` is then run once per VM that
# `qm list` reports as "running".
load_vm_info() {
    # Keep loop variables local so callers' vmid/name/status survive.
    local vmid name status _rest real_status

    load_vm_config_metadata

    while read -r vmid name status _rest; do
        # Skips the "VMID" header row as well as blank or malformed lines.
        [[ "$vmid" =~ ^[0-9]+$ ]] || continue
        VM_STATUS[$vmid]="$status"
        VM_NAME[$vmid]="$name"
    done < <(qm list 2>/dev/null)

    # `qm list` shows "running" for paused/suspended VMs too, so ask
    # `qm status` for the precise state of those guests only.
    for vmid in "${!VM_STATUS[@]}"; do
        [[ "${VM_STATUS[$vmid]}" == "running" ]] || continue
        real_status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}')
        if [[ -n "$real_status" ]]; then
            VM_STATUS[$vmid]="$real_status"
        fi
    done
}
171

            
172
# Exact-match membership test.
# Arguments: $1 - value to look for; $2.. - candidate values
# Returns:   0 if any candidate equals $1, 1 otherwise.
array_contains() {
    local needle="$1" candidate
    for candidate in "${@:2}"; do
        if [[ "$candidate" == "$needle" ]]; then
            return 0
        fi
    done
    return 1
}
181

            
182
# Append a value to an array unless it is already an element.
# Arguments: $1 - name of the target array (bound as a nameref)
#            $2 - value to append
append_unique() {
    local -n target_ref=$1
    local value="$2"

    if ! array_contains "$value" "${target_ref[@]}"; then
        target_ref+=("$value")
    fi
}
188

            
189
# Drop every element equal to a value from an array, keeping the order of
# the remaining elements.
# Arguments: $1 - name of the target array (bound as a nameref)
#            $2 - value to remove
remove_value() {
    local -n target_ref=$1
    local value="$2"
    local item
    local filtered=()

    for item in "${target_ref[@]}"; do
        if [[ "$item" != "$value" ]]; then
            filtered+=("$item")
        fi
    done

    target_ref=("${filtered[@]}")
}
202

            
203
# Print the YYYY-MM-DD stamp embedded in a suspend-state volume name.
# Arguments: $1 - VMID the volume must belong to
#            $2 - volume ID or path; only the part after the last "/" is used
# Outputs:   the date on stdout when the name is
#            vm-<vmid>-state-suspend-<date>.raw; nothing otherwise.
extract_suspend_file_date() {
    local vmid="$1"
    local volume="$2"
    local pattern="^vm-${vmid}-state-suspend-([0-9]{4}-[0-9]{2}-[0-9]{2})\.raw$"

    [[ "${volume##*/}" =~ $pattern ]] || return 0
    echo "${BASH_REMATCH[1]}"
}
212

            
213
# Load container status and name from a single `pct list` call.
# Globals: CT_STATUS, CT_NAME (written, keyed by CTID)
load_ct_info() {
    # Keep the read targets local so callers' vmid/name/status survive.
    local vmid status lock name

    # pct list columns: VMID Status Lock Name.
    # When Lock is empty, read shifts Name into the lock variable.
    while read -r vmid status lock name; do
        # Skips the "VMID" header row as well as blank or malformed lines.
        [[ "$vmid" =~ ^[0-9]+$ ]] || continue
        if [[ -z "$name" ]]; then
            # No lock present: the third field actually holds the name.
            name="$lock"
            lock=""
        fi
        CT_STATUS[$vmid]="$status"
        CT_NAME[$vmid]="$name"
    done < <(pct list 2>/dev/null)
}
228

            
229
# Print the cached name of a VM, or "unknown" when VM_NAME has no non-empty
# entry for it.
# Arguments: $1 - VMID
get_vm_name() {
    local cached="${VM_NAME[$1]:-unknown}"
    echo "$cached"
}
233

            
234
# Succeeds when /etc/pve/qemu-server/<vmid>.conf contains the exact line
# "lock: suspended". A missing or unreadable file counts as "no lock".
# Arguments: $1 - VMID
vm_has_suspend_lock() {
    local conf="/etc/pve/qemu-server/${1}.conf"
    grep -q '^lock: suspended$' "$conf" 2>/dev/null
}
238

            
239
# Succeeds when the current configuration of a VM carries a "vmstate:" entry.
# Arguments: $1 - VMID
#            $2 - directory holding <vmid>.conf files
#                 (optional, default /etc/pve/qemu-server)
# Returns:   0 if found, non-zero otherwise (including a missing file).
# Only the head of the file is examined: scanning stops at the first
# "[section]" line, so "vmstate:" entries kept inside snapshot sections do
# not count as a reference from the running configuration.
vm_has_vmstate_reference() {
    local vmid="$1"
    local conf_dir="${2:-/etc/pve/qemu-server}"

    awk '
        /^\[/       { exit }
        /^vmstate:/ { found = 1; exit }
        END         { exit !found }
    ' "${conf_dir}/${vmid}.conf" 2>/dev/null
}
243

            
244
# Print the vmstate volume cached for a VM in VM_VMSTATE, or an empty line
# when none is cached.
# Arguments: $1 - VMID
get_vm_vmstate_volume() {
    local vmid="$1"
    local cached="${VM_VMSTATE[$vmid]:-}"
    echo "$cached"
}
248

            
249
# Succeeds when a bare file name has exactly the form
# vm-<vmid>-state-suspend-YYYY-MM-DD.raw for the given VMID.
# Arguments: $1 - VMID (must be all digits)
#            $2 - file name without any directory part
# Returns:   0 on match, 1 otherwise.
is_strict_suspend_volume_name() {
    local vmid="$1"
    local name="$2"

    # The VMID is spliced into the regex below; refuse anything that is not a
    # plain number so an empty or metacharacter value cannot widen the match.
    [[ "$vmid" =~ ^[0-9]+$ ]] || return 1
    [[ "$name" =~ ^vm-${vmid}-state-suspend-[0-9]{4}-[0-9]{2}-[0-9]{2}\.raw$ ]]
}
254

            
255
# Succeeds only for storage type "dir".
# Arguments: $1 - storage type as written in the storage configuration
# Cleanup walks filesystem paths directly under <path>/images, so it is kept
# to local directory-backed storages: a stale remote mount must not be able
# to block planned maintenance in kernel I/O wait.
storage_cleanup_supports_path_scan() {
    case "$1" in
        dir) return 0 ;;
        *) return 1 ;;
    esac
}
263

            
264
# Succeeds when a volume ID is non-empty and its final path component passes
# is_strict_suspend_volume_name for the given VMID.
# Arguments: $1 - VMID; $2 - volume ID (may be empty)
vmstate_volume_looks_like_suspend_artifact() {
    local vmid="$1"
    local volume="$2"

    if [[ -z "$volume" ]]; then
        return 1
    fi
    is_strict_suspend_volume_name "$vmid" "${volume##*/}"
}
272

            
273
# Print what `pvesm path` reports for a volume ID; pvesm's stderr is dropped
# and its exit status is passed through.
# Arguments: $1 - volume ID
resolve_storage_volume_path() {
    local volid="$1"
    pvesm path "$volid" 2>/dev/null
}
277

            
278
# Succeeds when a volume ID resolves to a path that exists on this node.
# Arguments: $1 - volume ID (may be empty)
# Returns:   1 for an empty ID or when path resolution fails; otherwise the
#            result of the existence check on the resolved path.
vmstate_volume_exists() {
    local volume="$1"
    local resolved_path

    if [[ -z "$volume" ]]; then
        return 1
    fi
    if ! resolved_path=$(resolve_storage_volume_path "$volume"); then
        return 1
    fi
    [[ -n "$resolved_path" ]] && [[ -e "$resolved_path" ]]
}
286

            
287
# Free a suspend-state volume through `pvesm free`.
# Arguments: $1 - VMID the volume belongs to; $2 - volume ID
# Globals:   VM_NAME, DRY_RUN (read)
# Returns:   0 when the volume was removed, was already absent, or in dry-run
#            mode; 1 when the name is not a suspend artifact or removal failed.
remove_suspend_volume_by_volid() {
    local vmid="$1"
    local volume="$2"
    local name="${VM_NAME[$vmid]:-unknown}"
    local free_output

    # Never touch anything that is not named like a suspend-state volume.
    if ! vmstate_volume_looks_like_suspend_artifact "$vmid" "$volume"; then
        log_warning "VM $vmid ($name) suspend volume does not look like a suspend artifact, leaving it untouched: ${volume:-none}"
        return 1
    fi

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would remove stale vmstate volume for VM $vmid ($name): $volume"
        return 0
    fi

    if free_output=$(pvesm free "$volume" 2>&1); then
        log_info "Removed stale vmstate volume for VM $vmid ($name): $volume"
        return 0
    fi

    # One more attempt, but only if the failure looked quorum-related and the
    # quorum could be relaxed.
    if maybe_relax_quorum "$free_output" && free_output=$(pvesm free "$volume" 2>&1); then
        log_info "Removed stale vmstate volume for VM $vmid ($name) after quorum recovery: $volume"
        return 0
    fi

    # A "missing" error means there is nothing left to remove.
    if echo "$free_output" | grep -qiE 'does not exist|no such file|not found'; then
        log_info "Stale vmstate volume for VM $vmid ($name) was already absent: $volume"
        return 0
    fi

    log_warning "VM $vmid ($name) stale vmstate volume could not be removed: $volume ($free_output)"
    return 1
}
325

            
326
# Delete the vmstate entry from a VM's configuration via
# `qm set <vmid> --delete vmstate`.
# Arguments: $1 - VMID
# Globals:   VM_NAME, DRY_RUN (read)
# Returns:   0 on success or in dry-run mode, 1 when the entry could not be
#            removed (also after one quorum-relaxed retry, if attempted).
clear_vmstate_metadata() {
    local vmid="$1"
    local name="${VM_NAME[$vmid]:-unknown}"
    local set_output

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would remove stale vmstate metadata for VM $vmid ($name)"
        return 0
    fi

    if set_output=$(qm set "$vmid" --delete vmstate 2>&1); then
        log_info "Removed stale vmstate metadata for VM $vmid ($name)"
        return 0
    fi

    # Retry once when the failure looked quorum-related and was relaxed.
    if maybe_relax_quorum "$set_output" && set_output=$(qm set "$vmid" --delete vmstate 2>&1); then
        log_info "Removed stale vmstate metadata for VM $vmid ($name) after quorum recovery"
        return 0
    fi

    log_warning "VM $vmid ($name) stale vmstate metadata could not be removed: $set_output"
    return 1
}
353

            
354
# Thin wrapper: delegates to remove_suspend_volume_by_volid and passes its
# exit status through.
# Arguments: $1 - VMID; $2 - volume ID
free_stale_vmstate_volume() {
    remove_suspend_volume_by_volid "$1" "$2"
}
360

            
361
# Remove whatever suspend leftovers a VM has: the "suspended" lock, the
# vmstate volume (if it still exists) and the vmstate config entry.
# Arguments: $1 - VMID
#            $2 - optional context phrase forwarded to unlock_vm_suspend_lock
# Returns:   0 when there was nothing to clean or every step succeeded,
#            non-zero when any step failed.
cleanup_stale_suspend_artifacts() {
    local vmid="$1"
    local context="${2:-}"
    local name="${VM_NAME[$vmid]:-unknown}"
    local volume
    local failures=0

    volume=$(get_vm_vmstate_volume "$vmid")

    if vm_has_suspend_lock "$vmid"; then
        unlock_vm_suspend_lock "$vmid" "$context" || failures=1
    fi

    if [[ -n "$volume" ]]; then
        if ! vmstate_volume_exists "$volume"; then
            log_info "VM $vmid ($name) has stale vmstate metadata pointing to missing volume: $volume"
        elif ! free_stale_vmstate_volume "$vmid" "$volume"; then
            failures=1
        fi

        # The config entry is cleared whether or not the volume was present.
        clear_vmstate_metadata "$vmid" || failures=1
    fi

    # failures stays 0 when neither branch ran, so this also covers the
    # "nothing to do" case.
    [[ $failures -eq 0 ]]
}
399

            
400
# Succeeds only when a VM's suspend state is complete: it holds the
# "suspended" lock, its config references a vmstate volume, the cached volume
# is named like a suspend artifact, and that volume exists.
# Arguments: $1 - VMID
vm_has_valid_suspend_state() {
    local vmid="$1"
    local volume

    if ! vm_has_suspend_lock "$vmid"; then
        return 1
    fi
    if ! vm_has_vmstate_reference "$vmid"; then
        return 1
    fi

    volume=$(get_vm_vmstate_volume "$vmid")
    if ! vmstate_volume_looks_like_suspend_artifact "$vmid" "$volume"; then
        return 1
    fi
    vmstate_volume_exists "$volume"
}
410

            
411
# Look up which VM's config references a vmstate volume, using the
# VMSTATE_TO_VMID cache.
# Arguments: $1 - volume ID
# Outputs:   the VMID on stdout when one is cached
# Returns:   0 when found, 1 otherwise.
get_referencing_vmid_for_vmstate() {
    local target_volume="$1"
    local owner="${VMSTATE_TO_VMID[$target_volume]:-}"

    if [[ -z "$owner" ]]; then
        return 1
    fi
    echo "$owner"
    return 0
}
418

            
419
# List suspend-state image files found on scannable storages.
# Arguments: $1 - storage configuration file
#                 (optional, default /etc/pve/storage.cfg)
# Outputs:   one TAB-separated line per file:
#            <storage> <storage>:<vmid>/<file name> <absolute path>
# The configuration is read as blank-line separated stanzas whose first line
# is "<type>: <name>". Only stanzas with a path and an "images" content type
# are considered, and only storage types accepted by
# storage_cleanup_supports_path_scan are walked.
list_suspend_artifact_files() {
    local storage_cfg="${1:-/etc/pve/storage.cfg}"
    local storage_type storage path file relative_path vm_dir file_name

    awk '
        BEGIN {
            RS = ""
            FS = "\n"
        }
        {
            type = ""
            name = ""
            path = ""
            content = ""
            # split() returns the element count; avoids length(array), which
            # not every awk implements.
            if (split($1, header_parts, /:[[:space:]]+/) >= 2) {
                type = header_parts[1]
                name = header_parts[2]
            }

            for (i = 2; i <= NF; i++) {
                line = $i
                # Accept any indentation (tabs or spaces) and ignore trailing
                # whitespace on property lines.
                sub(/^[[:space:]]+/, "", line)
                sub(/[[:space:]]+$/, "", line)
                if (line ~ /^path[[:space:]]/) {
                    sub(/^path[[:space:]]+/, "", line)
                    path = line
                } else if (line ~ /^content[[:space:]]/) {
                    sub(/^content[[:space:]]+/, "", line)
                    content = line
                }
            }

            if (name != "" && path != "" && content ~ /(^|,)images(,|$)/) {
                print type "\t" name "\t" path
            }
        }
    ' "$storage_cfg" 2>/dev/null | while IFS=$'\t' read -r storage_type storage path; do
        [[ -z "$storage" || -z "$path" ]] && continue
        storage_cleanup_supports_path_scan "$storage_type" || continue
        [[ -d "${path}/images" ]] || continue

        for file in "${path}"/images/[0-9]*/vm-*-state-suspend-????-??-??.raw; do
            # An unmatched glob stays literal; skip it.
            [[ -e "$file" ]] || continue
            # Inner quotes make the prefix literal even if the storage path
            # contains glob characters.
            relative_path="${file#"${path}/images/"}"
            [[ "$relative_path" == "$file" ]] && continue
            vm_dir="${relative_path%%/*}"
            file_name="${relative_path##*/}"
            [[ "$vm_dir" =~ ^[0-9]+$ ]] || continue
            # The file must be named for the VM directory it sits in.
            is_strict_suspend_volume_name "$vm_dir" "$file_name" || continue
            printf '%s\t%s:%s/%s\t%s\n' "$storage" "$storage" "$vm_dir" "$file_name" "$file"
        done
    done
}
469

            
470
# Walk every suspend-state file reported by list_suspend_artifact_files and
# remove the ones that are stale.
#   - A volume referenced by a VM with a valid suspend state is kept.
#   - A referenced volume with inconsistent state is handed to
#     cleanup_stale_suspend_artifacts.
#   - An unreferenced volume is removed (or only reported in dry-run mode).
# Globals: DRY_RUN, VM_NAME (read; VM_NAME gets an "unknown" entry for VMIDs
#          seen only on storage)
# Returns: number of failed removals, capped at 255 so that large counts
#          cannot wrap around to 0 (exit statuses are 8-bit).
cleanup_orphan_suspend_artifacts() {
    local cleaned_count=0
    local skipped_count=0
    local fail_count=0
    local storage
    local volume
    local file_path
    local vmid

    log_info "Scanning storages for orphan suspend-state volumes..."

    while IFS=$'\t' read -r storage volume file_path; do
        [[ -z "$volume" ]] && continue

        if vmid=$(get_referencing_vmid_for_vmstate "$volume"); then
            if vm_has_valid_suspend_state "$vmid"; then
                log_info "Keeping active suspend-state volume for VM $vmid (${VM_NAME[$vmid]:-unknown}): $volume"
                skipped_count=$((skipped_count + 1))
            else
                log_warning "VM $vmid (${VM_NAME[$vmid]:-unknown}) references inconsistent suspend artifacts - cleaning up"
                if cleanup_stale_suspend_artifacts "$vmid" "during cleanup"; then
                    cleaned_count=$((cleaned_count + 1))
                else
                    fail_count=$((fail_count + 1))
                fi
            fi
            continue
        fi

        if [[ $DRY_RUN -eq 1 ]]; then
            echo "would remove orphan suspend-state volume: $volume"
            log_debug "real path: $file_path"
            cleaned_count=$((cleaned_count + 1))
            continue
        fi

        # Take the VMID from the volume name itself; anything that does not
        # parse is left alone.
        if [[ "$volume" =~ ^([^:]+):([0-9]+)/vm-([0-9]+)-state-suspend-([0-9]{4}-[0-9]{2}-[0-9]{2})\.raw$ ]]; then
            vmid="${BASH_REMATCH[3]}"
        else
            log_warning "Skipping suspicious suspend-state volume with unexpected name: $volume"
            skipped_count=$((skipped_count + 1))
            continue
        fi

        VM_NAME[$vmid]="${VM_NAME[$vmid]:-unknown}"
        if remove_suspend_volume_by_volid "$vmid" "$volume"; then
            log_info "Removed orphan suspend-state volume from $storage: $volume"
            cleaned_count=$((cleaned_count + 1))
        else
            fail_count=$((fail_count + 1))
        fi
    done < <(list_suspend_artifact_files)

    log_success "Suspend artifact cleanup complete: $cleaned_count cleaned, $skipped_count retained, $fail_count failed"

    if ((fail_count > 255)); then
        return 255
    fi
    return "$fail_count"
}
526

            
527
# Remove a leftover "suspended" lock from a VM via `qm unlock`.
# Arguments: $1 - VMID
#            $2 - optional context phrase woven into the messages
# Globals:   VM_NAME, DRY_RUN (read)
# Returns:   0 when no lock is present, the lock was removed, or in dry-run
#            mode; 1 when the lock could not be removed.
unlock_vm_suspend_lock() {
    local vmid="$1"
    local context="${2:-}"
    local name="${VM_NAME[$vmid]:-unknown}"
    # " <context>" when a context was given, empty otherwise; lets every
    # message below be written once.
    local suffix="${context:+ $context}"
    local unlock_output

    vm_has_suspend_lock "$vmid" || return 0

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would remove stale suspend lock for VM $vmid ($name)$suffix"
        return 0
    fi

    if unlock_output=$(qm unlock "$vmid" 2>&1); then
        log_info "Removed stale suspend lock for VM $vmid ($name)$suffix"
        return 0
    fi

    # Retry once when the failure looked quorum-related and was relaxed.
    if maybe_relax_quorum "$unlock_output" && unlock_output=$(qm unlock "$vmid" 2>&1); then
        log_info "Removed stale suspend lock for VM $vmid ($name)$suffix after quorum recovery"
        return 0
    fi

    log_warning "VM $vmid ($name) has a stale suspend lock$suffix but it could not be removed: $unlock_output"
    return 1
}
575

            
576
# Remove a leftover suspend lock from a VM that is now running; delegates to
# unlock_vm_suspend_lock with a fixed context phrase.
# Arguments: $1 - VMID
unlock_vm_if_needed() {
    local vmid="$1"
    unlock_vm_suspend_lock "$vmid" "while VM is running"
}
579

            
580
# Quorum-sensitive operations may fail during cluster-wide maintenance when
# pmxcfs becomes read-only. Given the output of such a failed command, this
# function applies `pvecm expected 1` at most once per run so the caller can
# retry. It does not restore the previous expected-votes value.
# Arguments: $1 - combined output of the failed command
# Globals:   QUORUM_RELAXED (read; set to 1 once the relaxation succeeded)
# Returns:   0 when quorum was relaxed just now (caller should retry);
#            1 when it was already relaxed, the output does not look
#            quorum-related, or `pvecm expected 1` failed.
maybe_relax_quorum() {
    local cmd_output="$1"
    local quorum_error_re='cluster not ready - no quorum|/etc/pve/.+\.conf\.tmp.+(Permission denied|Device or resource busy)'

    # Already attempted in this run.
    if [[ $QUORUM_RELAXED -eq 1 ]]; then
        return 1
    fi

    if ! echo "$cmd_output" | grep -qiE "$quorum_error_re"; then
        return 1
    fi

    log_warning "Detected quorum-related write failure in /etc/pve - attempting temporary 'pvecm expected 1'"
    if ! pvecm expected 1 >/dev/null 2>&1; then
        log_error "Failed to apply 'pvecm expected 1' after quorum-related error"
        return 1
    fi

    QUORUM_RELAXED=1
    log_warning "Applied 'pvecm expected 1' for this maintenance cycle; retrying operation"
    return 0
}
603

            
604
# Private helper: extract the stale suspend image path named in the output of
# a failed `qm suspend --todisk`.
# Arguments: $1 - VMID; $2 - command output
# Outputs:   the path on stdout
# Returns:   0 only when a path was found, it is named
#            vm-<vmid>-state-suspend-YYYY-MM-DD.raw and it is a regular file.
# Recognised messages:
#   - "stale saved state disk image ('...raw' already exists)"
#   - "disk image '...raw' already exists"
_pgs_stale_suspend_image() {
    local vmid="$1"
    local output="$2"
    local candidate

    candidate=$(
        printf '%s\n' "$output" | sed -n \
            -e "s/.*stale saved state[[:space:]]*disk image ('\\([^']*\\)' already exists).*/\\1/p" \
            -e "s/.*disk image '\\([^']*\\)' already exists.*/\\1/p" | head -n 1
    )

    [[ -n "$candidate" ]] || return 1
    # The path comes from an error message and is about to be deleted, so it
    # must match the strict suspend-image name for this VM.
    [[ "$candidate" =~ /vm-${vmid}-state-suspend-[0-9]{4}-[0-9]{2}-[0-9]{2}\.raw$ ]] || return 1
    [[ -f "$candidate" ]] || return 1
    printf '%s\n' "$candidate"
}

# Suspend a VM to disk, recovering from two known failure modes: a stale
# suspend image left by an interrupted earlier suspend, and quorum-related
# write failures (see maybe_relax_quorum).
# Arguments: $1 - VMID
# Globals:   VM_NAME, DRY_RUN (read)
# Returns:   0 on success or in dry-run mode, 1 on failure.
suspend_vm_to_disk() {
    local vmid="$1"
    local name="${VM_NAME[$vmid]:-unknown}"
    local qm_output
    local stale_path
    local retry_output
    local stale_retry_path

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would suspend VM $vmid ($name) to disk"
        return 0
    fi

    log_info "Suspending VM $vmid ($name) to disk..."
    if qm_output=$(qm suspend "$vmid" --todisk 1 2>&1); then
        log_success "VM $vmid ($name) suspended to disk"
        return 0
    fi

    # Failure mode 1: a stale image blocks the suspend. Remove it, retry, and
    # allow one quorum recovery (plus one more stale cleanup) on the retry.
    if stale_path=$(_pgs_stale_suspend_image "$vmid" "$qm_output"); then
        log_warning "VM $vmid ($name) has stale suspend image: $stale_path - removing and retrying once"
        if ! rm -f -- "$stale_path"; then
            log_error "Failed to remove stale suspend image for VM $vmid ($name): $stale_path"
            return 1
        fi

        if retry_output=$(qm suspend "$vmid" --todisk 1 2>&1); then
            log_success "VM $vmid ($name) suspended to disk (after stale image cleanup)"
            return 0
        fi

        if maybe_relax_quorum "$retry_output"; then
            if retry_output=$(qm suspend "$vmid" --todisk 1 2>&1); then
                log_success "VM $vmid ($name) suspended to disk (after stale image cleanup + quorum recovery)"
                return 0
            fi
            if stale_retry_path=$(_pgs_stale_suspend_image "$vmid" "$retry_output"); then
                log_warning "VM $vmid ($name) retry left stale suspend image: $stale_retry_path - removing and retrying once more"
                if rm -f -- "$stale_retry_path" && retry_output=$(qm suspend "$vmid" --todisk 1 2>&1); then
                    log_success "VM $vmid ($name) suspended to disk (after stale image cleanup + quorum recovery + retry)"
                    return 0
                fi
            fi
        fi

        log_error "Failed to suspend VM $vmid ($name) after stale image cleanup: $retry_output"
        return 1
    fi

    # Failure mode 2: quorum-related failure. Relax quorum, retry, and allow
    # one stale-image cleanup on the retry.
    if maybe_relax_quorum "$qm_output"; then
        if retry_output=$(qm suspend "$vmid" --todisk 1 2>&1); then
            log_success "VM $vmid ($name) suspended to disk (after quorum recovery)"
            return 0
        fi
        if stale_retry_path=$(_pgs_stale_suspend_image "$vmid" "$retry_output"); then
            log_warning "VM $vmid ($name) quorum retry hit stale suspend image: $stale_retry_path - removing and retrying once more"
            if rm -f -- "$stale_retry_path" && retry_output=$(qm suspend "$vmid" --todisk 1 2>&1); then
                log_success "VM $vmid ($name) suspended to disk (after quorum recovery + stale retry)"
                return 0
            fi
        fi
        log_error "Failed to suspend VM $vmid ($name) after quorum recovery: $retry_output"
        return 1
    fi

    log_error "Failed to suspend VM $vmid ($name) to disk: $qm_output"
    return 1
}
699

            
700
# Resume a VM via `qm resume`, then drop any leftover suspend lock.
# Arguments: $1 - VMID
# Globals:   VM_NAME, DRY_RUN (read)
# Returns:   0 when resumed (or in dry-run mode);
#            2 when the resume command failed but the VM is running anyway;
#            1 on failure.
resume_vm() {
    local vmid="$1"
    local name="${VM_NAME[$vmid]:-unknown}"
    local qm_output
    local current_status

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would resume VM $vmid ($name)"
        return 0
    fi

    log_info "Resuming VM $vmid ($name)..."
    if qm_output=$(qm resume "$vmid" 2>&1); then
        unlock_vm_if_needed "$vmid"
        log_success "VM $vmid ($name) resumed successfully"
        return 0
    fi

    # Quorum-related failure: relax once, retry, then judge by actual status.
    if maybe_relax_quorum "$qm_output"; then
        if qm_output=$(qm resume "$vmid" 2>&1); then
            unlock_vm_if_needed "$vmid"
            log_success "VM $vmid ($name) resumed successfully (after quorum recovery)"
            return 0
        fi
        current_status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}')
        if [[ "$current_status" != "running" ]]; then
            log_error "Failed to resume VM $vmid ($name) after quorum recovery: $qm_output"
            return 1
        fi
        unlock_vm_if_needed "$vmid"
        log_warning "VM $vmid ($name) is running despite resume error after quorum recovery - treating as resumed"
        return 2
    fi

    if echo "$qm_output" | grep -qi "already running"; then
        unlock_vm_if_needed "$vmid"
        log_warning "VM $vmid ($name) is already running - treating as resumed"
        return 2
    fi

    # Last resort: the command failed, but the guest may be up regardless.
    current_status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}')
    if [[ "$current_status" != "running" ]]; then
        log_error "Failed to resume VM $vmid ($name): $qm_output"
        return 1
    fi
    unlock_vm_if_needed "$vmid"
    log_warning "VM $vmid ($name) is running despite resume error - treating as resumed"
    return 2
}
753

            
754
# Gracefully shut down a container with `pct shutdown` (120 second timeout).
# Arguments: $1 - CTID
# Globals:   CT_NAME, DRY_RUN (read)
# Returns:   0 on success or in dry-run mode, 1 on failure.
shutdown_ct() {
    local ctid="$1"
    local name="${CT_NAME[$ctid]:-unknown}"

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would shutdown CT $ctid ($name)"
        return 0
    fi

    log_info "Shutting down CT $ctid ($name)..."
    if ! pct shutdown "$ctid" --timeout 120; then
        log_error "Failed to shutdown CT $ctid ($name)"
        return 1
    fi

    log_success "CT $ctid ($name) shut down gracefully"
    return 0
}
773

            
774
# Start a container via `pct start`.
# Arguments: $1 - CTID
# Globals:   CT_NAME, DRY_RUN (read)
# Returns:   0 when started (or in dry-run mode);
#            2 when the start command failed but the CT is running anyway;
#            1 on failure.
start_ct() {
    local ctid="$1"
    local name="${CT_NAME[$ctid]:-unknown}"
    local pct_output
    local current_status

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would start CT $ctid ($name)"
        return 0
    fi

    log_info "Starting CT $ctid ($name)..."
    if pct_output=$(pct start "$ctid" 2>&1); then
        log_success "CT $ctid ($name) started successfully"
        return 0
    fi

    # Quorum-related failure: relax once, retry, then judge by actual status.
    if maybe_relax_quorum "$pct_output"; then
        if pct_output=$(pct start "$ctid" 2>&1); then
            log_success "CT $ctid ($name) started successfully (after quorum recovery)"
            return 0
        fi
        current_status="$(pct status "$ctid" 2>/dev/null | awk '{print $2}')"
        if [[ "$current_status" == "running" ]]; then
            log_warning "CT $ctid ($name) is running despite start error after quorum recovery - treating as started"
            return 2
        fi
        log_error "Failed to start CT $ctid ($name) after quorum recovery: $pct_output"
        return 1
    fi

    if echo "$pct_output" | grep -qi "already running"; then
        log_warning "CT $ctid ($name) is already running - treating as started"
        return 2
    fi

    # Last resort: the command failed, but the container may be up regardless.
    current_status="$(pct status "$ctid" 2>/dev/null | awk '{print $2}')"
    if [[ "$current_status" == "running" ]]; then
        log_warning "CT $ctid ($name) is running despite start error - treating as started"
        return 2
    fi

    log_error "Failed to start CT $ctid ($name): $pct_output"
    return 1
}
819

            
820
# Save state to JSON file
# Usage: save_state vm_resume_array vm_suspended_array ct_start_array
#
# Each argument is the NAME of an indexed array (bound through a nameref):
#   $1 - VMs that must be auto-resumed after maintenance
#   $2 - VMs that were already suspended (recorded, never auto-resumed)
#   $3 - CTs that must be started again after maintenance
#
# The lists are merged with an already existing state file, so repeated
# suspend runs accumulate instead of overwriting each other. A VM listed for
# resume is never also kept in the was_suspended list. For every VM the
# suspend volume and suspend file date are stored under "vm_details"; values
# from the current run win over values found in the existing state file.
#
# Returns: 0 on success (and in dry-run mode), 1 when the JSON document could
#          not be built or the state file could not be written.
save_state() {
    local -n to_resume_ref=$1
    local -n was_suspended_ref=$2
    local -n ct_to_start_ref=$3
    local existing_state_json=""
    local existing_to_resume=()
    local existing_was_suspended=()
    local existing_ct_to_start=()
    local final_to_resume=()
    local final_was_suspended=()
    local final_ct_to_start=()
    local detail_entries=()
    local vmid
    local volume
    local suspend_date
    local mode
    local entry
    local state_json
    local tmp_file
    local -A existing_vm_volume=()
    local -A existing_vm_date=()
    local -A current_vm_volume=()
    local -A current_vm_date=()

    if [[ $DRY_RUN -eq 1 ]]; then
        echo "would save state to $STATE_FILE"
        echo "  to_resume (VMs): ${to_resume_ref[*]}"
        echo "  was_suspended (VMs): ${was_suspended_ref[*]}"
        echo "  ct_to_start (CTs): ${ct_to_start_ref[*]}"
        return 0
    fi

    if existing_state_json=$(load_state 2>/dev/null); then
        mapfile -t existing_to_resume < <(jq -r '.to_resume[]?' <<<"$existing_state_json" 2>/dev/null)
        mapfile -t existing_was_suspended < <(jq -r '.was_suspended[]?' <<<"$existing_state_json" 2>/dev/null)
        mapfile -t existing_ct_to_start < <(jq -r '.ct_to_start[]?' <<<"$existing_state_json" 2>/dev/null)
        # Fields are separated by the ASCII unit separator rather than a tab:
        # tab is IFS whitespace, so `read` would collapse an empty volume
        # field and shift the date into its place.
        while IFS=$'\x1f' read -r vmid volume suspend_date; do
            [[ -z "$vmid" ]] && continue
            existing_vm_volume[$vmid]="$volume"
            existing_vm_date[$vmid]="$suspend_date"
        done < <(
            jq -r '
                (.vm_details // {})
                | to_entries[]
                | [.key, (.value.suspend_volume // ""), (.value.suspend_file_date // "")]
                | map(tostring)
                | join("\u001f")
            ' <<<"$existing_state_json" 2>/dev/null
        )
    fi

    # NOTE(review): presumably refreshes VM_VMSTATE after the suspend pass - confirm.
    refresh_vm_artifact_metadata

    # VMs suspended by this run come first, with freshly collected details.
    for vmid in "${to_resume_ref[@]}"; do
        append_unique final_to_resume "$vmid"
        volume="${VM_VMSTATE[$vmid]:-}"
        suspend_date=$(extract_suspend_file_date "$vmid" "$volume")
        current_vm_volume[$vmid]="$volume"
        current_vm_date[$vmid]="$suspend_date"
    done

    for vmid in "${existing_to_resume[@]}"; do
        append_unique final_to_resume "$vmid"
    done

    for vmid in "${existing_was_suspended[@]}"; do
        if ! array_contains "$vmid" "${final_to_resume[@]}"; then
            append_unique final_was_suspended "$vmid"
        fi
    done

    for vmid in "${was_suspended_ref[@]}"; do
        volume="${VM_VMSTATE[$vmid]:-}"
        if array_contains "$vmid" "${final_to_resume[@]}"; then
            # Already scheduled for resume: keep it there and only refresh the
            # recorded volume when a current one is known.
            if [[ -n "$volume" ]]; then
                current_vm_volume[$vmid]="$volume"
                current_vm_date[$vmid]="$(extract_suspend_file_date "$vmid" "$volume")"
            fi
            continue
        fi
        append_unique final_was_suspended "$vmid"
        suspend_date=$(extract_suspend_file_date "$vmid" "$volume")
        current_vm_volume[$vmid]="$volume"
        current_vm_date[$vmid]="$suspend_date"
    done

    # The resume list always wins over the was_suspended list.
    for vmid in "${final_to_resume[@]}"; do
        remove_value final_was_suspended "$vmid"
    done

    for vmid in "${existing_ct_to_start[@]}"; do
        append_unique final_ct_to_start "$vmid"
    done
    for vmid in "${ct_to_start_ref[@]}"; do
        append_unique final_ct_to_start "$vmid"
    done

    # Create JSON arrays (handle empty arrays properly)
    local to_resume_json="[]"
    local was_suspended_json="[]"
    local ct_to_start_json="[]"
    local vm_details_json="{}"

    if [[ ${#final_to_resume[@]} -gt 0 ]]; then
        to_resume_json=$(printf '%s\n' "${final_to_resume[@]}" | jq -R . | jq -s .)
    fi
    if [[ ${#final_was_suspended[@]} -gt 0 ]]; then
        was_suspended_json=$(printf '%s\n' "${final_was_suspended[@]}" | jq -R . | jq -s .)
    fi
    if [[ ${#final_ct_to_start[@]} -gt 0 ]]; then
        ct_to_start_json=$(printf '%s\n' "${final_ct_to_start[@]}" | jq -R . | jq -s .)
    fi

    # One "mode:vmid" entry per VM, resume list first, so a single loop builds
    # the vm_details object for both lists.
    for vmid in "${final_to_resume[@]}"; do
        detail_entries+=("to_resume:$vmid")
    done
    for vmid in "${final_was_suspended[@]}"; do
        detail_entries+=("was_suspended:$vmid")
    done

    for entry in "${detail_entries[@]}"; do
        mode="${entry%%:*}"
        vmid="${entry#*:}"
        volume="${current_vm_volume[$vmid]:-${existing_vm_volume[$vmid]:-}}"
        suspend_date="${current_vm_date[$vmid]:-${existing_vm_date[$vmid]:-}}"
        vm_details_json=$(
            jq \
                --arg vmid "$vmid" \
                --arg mode "$mode" \
                --arg volume "$volume" \
                --arg suspend_date "$suspend_date" \
                '
                .[$vmid] = {
                    mode: $mode,
                    suspend_volume: $volume,
                    suspend_file_date: $suspend_date
                }
                ' <<<"$vm_details_json"
        )
    done

    # Let jq assemble the document: strings are escaped properly and a broken
    # fragment from an earlier jq failure is rejected instead of being written.
    state_json=$(
        jq -n \
            --arg timestamp "$(date -Iseconds)" \
            --arg hostname "$(hostname)" \
            --argjson to_resume "$to_resume_json" \
            --argjson was_suspended "$was_suspended_json" \
            --argjson ct_to_start "$ct_to_start_json" \
            --argjson vm_details "$vm_details_json" \
            '{
                timestamp: $timestamp,
                hostname: $hostname,
                to_resume: $to_resume,
                was_suspended: $was_suspended,
                ct_to_start: $ct_to_start,
                vm_details: $vm_details
            }'
    ) || {
        log_error "Failed to build state JSON - state NOT saved to $STATE_FILE"
        return 1
    }

    # Write next to the target and rename, so an interrupted or failed write
    # never leaves a truncated state file behind.
    tmp_file="${STATE_FILE}.tmp.$$"
    if ! printf '%s\n' "$state_json" > "$tmp_file" || ! mv -f "$tmp_file" "$STATE_FILE"; then
        rm -f "$tmp_file"
        log_error "Failed to write state file $STATE_FILE"
        return 1
    fi

    log_info "State saved to $STATE_FILE"
    return 0
}
981

            
982
# Load state from JSON file (outputs JSON only, no logging to avoid capture issues)
# Prints the raw contents of $STATE_FILE on stdout.
# Returns 1 when the state file does not exist; otherwise the status of cat.
load_state() {
    [[ -f "$STATE_FILE" ]] || return 1
    cat "$STATE_FILE"
}
989

            
990
# Remove state file after resume is complete
# In dry-run mode only announces the removal. A missing state file is not an
# error: nothing is removed and nothing is logged.
clear_state() {
    if (( DRY_RUN == 1 )); then
        echo "would remove state file $STATE_FILE"
        return 0
    fi

    if [[ ! -f "$STATE_FILE" ]]; then
        return 0
    fi

    rm -f "$STATE_FILE"
    log_info "State file removed"
}
1002

            
1003
# Move a state file left at the legacy location to the current location.
#
# Nothing happens when the configured state file IS the legacy file, when no
# legacy file exists, or when a state file already exists at the new location
# (the new file is never overwritten).
#
# Returns: 0 when no migration was needed or the migration succeeded,
#          1 when the target directory could not be created or the move failed
#          (the legacy file is then left where it was).
migrate_legacy_state_if_needed() {
    if [[ "${STATE_FILE}" == "${LEGACY_STATE_FILE}" ]]; then
        return 0
    fi

    if [[ ! -f "${LEGACY_STATE_FILE}" || -f "${STATE_FILE}" ]]; then
        return 0
    fi

    # Only report the migration once both steps really succeeded.
    if ! mkdir -p "${STATE_DIR}" || ! mv "${LEGACY_STATE_FILE}" "${STATE_FILE}"; then
        log_error "Failed to migrate legacy state file from ${LEGACY_STATE_FILE} to ${STATE_FILE}"
        return 1
    fi

    log_warning "Migrated legacy state file from ${LEGACY_STATE_FILE} to ${STATE_FILE}"
    return 0
}
1014

            
1015
# Main suspend operation
#
# Walks every QEMU VM and LXC container config on this node:
#   - running VMs are suspended to disk and recorded for auto-resume
#   - VMs suspended/paused in RAM are suspended to disk but recorded as
#     "was suspended" (never auto-resumed)
#   - stopped VMs with a valid suspend state are recorded as "was suspended";
#     inconsistent suspend artifacts are cleaned up as stale
#   - running CTs are shut down and recorded for auto-start
# The resulting lists are handed to save_state.
#
# Returns: the number of failed guests, capped at 255 (0 = full success).
do_suspend() {
    local conf vmid ctid name status
    local to_resume=()
    local was_suspended=()
    local ct_to_start=()
    local suspend_count=0
    local skip_count=0
    local fail_count=0

    log_info "Starting suspend/shutdown operation on $(hostname)"

    # Clean stale suspend artifacts before creating new suspend volumes.
    load_vm_config_metadata
    if ! cleanup_orphan_suspend_artifacts; then
        log_warning "Suspend artifact preflight cleanup had failures; continuing with suspend operation"
    fi

    # Load all VM and CT info in one pass
    load_vm_info
    load_ct_info

    # --- Process QEMU VMs ---
    log_info "Processing QEMU VMs..."
    for conf in /etc/pve/qemu-server/*.conf; do
        # Also skips the literal pattern when the glob matches nothing.
        [[ ! -f "$conf" ]] && continue

        vmid=$(basename "$conf" .conf)
        name="${VM_NAME[$vmid]:-unknown}"
        status="${VM_STATUS[$vmid]:-stopped}"

        case "$status" in
            running)
                # Running VM: suspend to disk, add to resume list
                if suspend_vm_to_disk "$vmid"; then
                    to_resume+=("$vmid")
                    ((suspend_count++))
                else
                    ((fail_count++))
                fi
                ;;
            suspended)
                # Suspended to RAM: save state to disk but DON'T add to resume list
                log_warning "VM $vmid ($name) is suspended to RAM - saving to disk but will NOT auto-resume (was manually suspended)"
                if suspend_vm_to_disk "$vmid"; then
                    was_suspended+=("$vmid")
                    ((suspend_count++))
                else
                    ((fail_count++))
                fi
                ;;
            stopped)
                # Could be stopped normally or suspended to disk
                if vm_has_valid_suspend_state "$vmid"; then
                    log_warning "VM $vmid ($name) is already suspended to disk - will NOT auto-resume"
                    was_suspended+=("$vmid")
                    ((skip_count++))
                elif vm_has_suspend_lock "$vmid" || vm_has_vmstate_reference "$vmid"; then
                    log_warning "VM $vmid ($name) has inconsistent suspend artifacts - treating them as stale"
                    if cleanup_stale_suspend_artifacts "$vmid" "while VM is stopped"; then
                        ((skip_count++))
                    else
                        ((fail_count++))
                    fi
                else
                    log_info "VM $vmid ($name) is stopped, skipping"
                fi
                ;;
            paused)
                # Paused/suspended to RAM: save state to disk but DON'T auto-resume
                log_warning "VM $vmid ($name) is paused/suspended to RAM - saving to disk but will NOT auto-resume (was manually paused)"
                if suspend_vm_to_disk "$vmid"; then
                    was_suspended+=("$vmid")
                    ((suspend_count++))
                else
                    ((fail_count++))
                fi
                ;;
            *)
                log_info "VM $vmid ($name) status '$status', skipping"
                ;;
        esac
    done

    # --- Process LXC Containers ---
    log_info "Processing LXC containers..."
    for conf in /etc/pve/lxc/*.conf; do
        [[ ! -f "$conf" ]] && continue

        ctid=$(basename "$conf" .conf)
        name="${CT_NAME[$ctid]:-unknown}"
        status="${CT_STATUS[$ctid]:-stopped}"

        case "$status" in
            running)
                # Running CT: graceful shutdown, add to start list
                if shutdown_ct "$ctid"; then
                    ct_to_start+=("$ctid")
                    ((suspend_count++))
                else
                    ((fail_count++))
                fi
                ;;
            stopped)
                log_info "CT $ctid ($name) is stopped, skipping"
                ;;
            *)
                log_info "CT $ctid ($name) status '$status', skipping"
                ;;
        esac
    done

    # Save state
    save_state to_resume was_suspended ct_to_start

    # Summary
    log_success "Suspend/shutdown complete: $suspend_count processed, $skip_count skipped, $fail_count failed"
    log_info "VMs to auto-resume: ${to_resume[*]:-none}"
    log_info "VMs NOT to auto-resume (were suspended): ${was_suspended[*]:-none}"
    log_info "CTs to auto-start: ${ct_to_start[*]:-none}"

    # Exit statuses are 8-bit: an uncapped count of 256 would wrap to 0 and
    # report success.
    return $(( fail_count > 255 ? 255 : fail_count ))
}
1137

            
1138
# Cleanup command: reload VM config metadata, then remove orphaned suspend
# artifacts. The function's status is the status of the cleanup step.
do_cleanup() {
    log_info "Starting suspend artifact cleanup on $(hostname)"

    load_vm_config_metadata
    cleanup_orphan_suspend_artifacts
}
1145

            
1146
# Main resume operation
#
# Reads the saved state and:
#   - logs (and counts as skipped) every VM that was already suspended before
#     maintenance
#   - resumes every VM on the resume list that still has its config, still
#     carries a suspend lock and still references the recorded suspend volume
#   - starts every CT on the start list that still exists and is not running
# The state file is removed only when nothing failed, so a failed run can be
# retried.
#
# Returns: 0 when there is no saved state or everything succeeded; 1 when the
#          state cannot be parsed; otherwise the number of failures, capped
#          at 255.
do_resume() {
    local state_json
    local saved_timestamp
    local vmid ctid name
    local saved_volume saved_date current_volume
    local to_resume=()
    local was_suspended=()
    local ct_to_start=()
    local -A saved_vm_volume=()
    local -A saved_vm_date=()
    local resume_count=0
    local skip_count=0
    local fail_count=0

    log_info "Starting resume/start operation on $(hostname)"

    # Load all VM and CT info in one pass
    load_vm_info
    load_ct_info

    if ! state_json=$(load_state); then
        log_warning "No saved state - nothing to resume"
        return 0
    fi

    # An unparseable state would otherwise look like "nothing to do", count
    # zero failures and get the state file deleted. Keep it and fail instead.
    if ! jq -e 'type == "object"' <<<"$state_json" >/dev/null 2>&1; then
        log_error "State file $STATE_FILE could not be parsed as a JSON object - keeping it, nothing resumed"
        return 1
    fi

    # Parse state file
    mapfile -t to_resume < <(jq -r '.to_resume[]? | select(. != "")' <<<"$state_json" 2>/dev/null)
    mapfile -t was_suspended < <(jq -r '.was_suspended[]? | select(. != "")' <<<"$state_json" 2>/dev/null)
    mapfile -t ct_to_start < <(jq -r '.ct_to_start[]? | select(. != "")' <<<"$state_json" 2>/dev/null)
    saved_timestamp=$(jq -r '.timestamp' <<<"$state_json" 2>/dev/null)

    # Fields are separated by the ASCII unit separator rather than a tab: tab
    # is IFS whitespace, so `read` would collapse an empty volume field and
    # shift the date into its place.
    while IFS=$'\x1f' read -r vmid saved_volume saved_date; do
        [[ -z "$vmid" ]] && continue
        saved_vm_volume[$vmid]="$saved_volume"
        saved_vm_date[$vmid]="$saved_date"
    done < <(
        jq -r '
            (.vm_details // {})
            | to_entries[]
            | [.key, (.value.suspend_volume // ""), (.value.suspend_file_date // "")]
            | map(tostring)
            | join("\u001f")
        ' <<<"$state_json" 2>/dev/null
    )

    log_info "State file from: $saved_timestamp"

    # --- Resume QEMU VMs ---

    # Log warnings for VMs that won't be resumed
    for vmid in "${was_suspended[@]}"; do
        name="${VM_NAME[$vmid]:-unknown}"
        log_warning "VM $vmid ($name) was already suspended before maintenance - NOT auto-resuming"
        ((skip_count++))
    done

    # Resume VMs that should be resumed
    for vmid in "${to_resume[@]}"; do
        name="${VM_NAME[$vmid]:-unknown}"

        # Verify VM still exists and has suspend lock
        if [[ ! -f "/etc/pve/qemu-server/${vmid}.conf" ]]; then
            log_error "VM $vmid config not found - skipping"
            ((fail_count++))
            continue
        fi

        if [[ -z "${VM_HAS_LOCK[$vmid]:-}" ]]; then
            log_warning "VM $vmid ($name) no longer has suspend lock - may have been manually resumed"
            ((skip_count++))
            continue
        fi

        # A different suspend volume means the VM was re-suspended by someone
        # else after the state file was written; leave it alone.
        saved_volume="${saved_vm_volume[$vmid]:-}"
        current_volume="${VM_VMSTATE[$vmid]:-}"
        if [[ -n "$saved_volume" && "$current_volume" != "$saved_volume" ]]; then
            log_warning "VM $vmid ($name) suspend volume changed since state file (${saved_vm_date[$vmid]:-unknown date}): saved=$saved_volume current=${current_volume:-none} - skipping auto-resume"
            ((skip_count++))
            continue
        fi

        # 0 = resumed, 2 = skipped, anything else = failure
        resume_vm "$vmid"
        case $? in
            0) ((resume_count++)) ;;
            2) ((skip_count++)) ;;
            *) ((fail_count++)) ;;
        esac
    done

    # --- Start LXC Containers ---
    for ctid in "${ct_to_start[@]}"; do
        name="${CT_NAME[$ctid]:-unknown}"

        # Verify CT still exists
        if [[ ! -f "/etc/pve/lxc/${ctid}.conf" ]]; then
            log_error "CT $ctid config not found - skipping"
            ((fail_count++))
            continue
        fi

        # Check if already running (someone started it manually)
        if [[ "${CT_STATUS[$ctid]:-}" == "running" ]]; then
            log_warning "CT $ctid ($name) is already running - skipping"
            ((skip_count++))
            continue
        fi

        # 0 = started, 2 = skipped, anything else = failure
        start_ct "$ctid"
        case $? in
            0) ((resume_count++)) ;;
            2) ((skip_count++)) ;;
            *) ((fail_count++)) ;;
        esac
    done

    # Clear state file only on full success; keep it for retry if any failures.
    if [[ $fail_count -eq 0 ]]; then
        clear_state
    else
        log_warning "Resume/start encountered failures - keeping state file for retry"
    fi

    # Summary
    log_success "Resume/start complete: $resume_count restored, $skip_count skipped, $fail_count failed"

    # Exit statuses are 8-bit: an uncapped count of 256 would wrap to 0 and
    # report success.
    return $(( fail_count > 255 ? 255 : fail_count ))
}
1270

            
1271
# Acquire lock to prevent concurrent runs
#
# The lock is $LOCK_FILE containing the PID of the owning process. It is
# created with noclobber, so creation fails when the file already exists and
# two instances cannot both pass a check-then-write window. A lock whose PID
# is no longer alive is treated as stale, removed, and creation is retried
# once. On success an EXIT trap removes the lock.
#
# Dry-run mode takes no lock. Exits the script with status 1 when another
# live instance holds the lock or the lock file cannot be created.
acquire_lock() {
    local pid
    local attempt

    if [[ $DRY_RUN -eq 1 ]]; then
        return 0
    fi

    for attempt in 1 2; do
        # noclobber is confined to the subshell; $$ still expands to the PID
        # of the main script there.
        if ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
            # Single quotes: the path is expanded when the trap runs, so its
            # content can never break the trap command.
            trap 'rm -f -- "$LOCK_FILE"' EXIT
            return 0
        fi

        pid=$(cat "$LOCK_FILE" 2>/dev/null)
        if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
            log_error "Another instance is running (PID $pid)"
            exit 1
        fi

        # Stale lock file
        rm -f "$LOCK_FILE"
    done

    log_error "Could not create lock file $LOCK_FILE"
    exit 1
}
1290

            
1291
# Parse command line
# Every accepted argument consumes exactly one word; -h/--help and unknown
# arguments leave the script from inside the loop.
COMMAND=""
while (( $# > 0 )); do
    case "$1" in
        suspend|resume|cleanup) COMMAND="$1" ;;
        -n|--dry-run)           DRY_RUN=1 ;;
        -v|--verbose)           VERBOSE=$((VERBOSE + 1)) ;;
        -vv)                    VERBOSE=2 ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            usage
            exit 1
            ;;
    esac
    shift
done

# A command word is mandatory.
[[ -n "$COMMAND" ]] || {
    echo "Error: No command specified" >&2
    usage
    exit 1
}

# Ensure state directory exists
mkdir -p "$STATE_DIR"

# Migrate state from the legacy location used by older installs.
migrate_legacy_state_if_needed

# Acquire lock
acquire_lock

# Execute command
# COMMAND can only be one of the three words accepted by the parser; each maps
# to the do_<command> function and its status becomes the script's exit status.
case "$COMMAND" in
    suspend|resume|cleanup)
        "do_${COMMAND}"
        exit $?
        ;;
esac