#!/bin/bash # pgs # Manages VM and CT suspend/shutdown for planned maintenance. # # Before maintenance (suspend mode): # - Suspends all running VMs to disk # - Gracefully shuts down all running CTs # - Saves state to a list for restoration # - VMs already suspended to disk: logged as warning, not auto-resumed # - VMs suspended to RAM: suspended to disk but not auto-resumed (preserving user intent) # # After maintenance (resume mode): # - Resumes VMs from the saved list # - Starts CTs from the saved list # - Logs warnings for VMs/CTs skipped # - Logs errors for VMs/CTs that fail to resume/start # # Usage: pgs suspend|resume [--dry-run] [-v] # # Version: 1.4 - Standardized xdev state path with legacy state migration # # TODO: Implement critical VM/CT migration support. # Critical guests (tagged or listed) should be live-migrated to another # node before maintenance instead of suspended/stopped. Rules TBD: # - Which guests are critical (tag? config flag? external list?) # - Target node selection (least loaded? affinity rules?) # - Fallback if migration fails (suspend locally?) # - Post-maintenance: migrate back or leave on target node? PROJECT_ID="pve-guests-state" ORG_ID="xdev" DEFAULT_STATE_DIR="/var/lib/${ORG_ID}/${PROJECT_ID}" LEGACY_STATE_DIR="/var/lib/pve-manager" LEGACY_STATE_FILE="${LEGACY_STATE_DIR}/pgs-state.json" STATE_DIR="${PGS_STATE_DIR:-${DEFAULT_STATE_DIR}}" STATE_FILE="${STATE_DIR}/pgs-state.json" LOCK_FILE="/run/pgs.lock" SCRIPT_NAME=$(basename "$0") DRY_RUN=0 VERBOSE=0 QUORUM_RELAXED=0 # Associative arrays for VM data (populated once) declare -A VM_STATUS declare -A VM_NAME declare -A VM_HAS_LOCK declare -A VM_VMSTATE declare -A VMSTATE_TO_VMID # Associative arrays for CT data (populated once) declare -A CT_STATUS declare -A CT_NAME # Logging functions. # When running inside systemd (JOURNAL_STREAM is set), stdout goes directly to # the journal - calling logger in addition causes duplicate entries. When running # interactively, use both echo (terminal) and logger (journal archive). _log() { local level="$1" prefix="$2"; shift 2 echo "$prefix $*" [[ -z "${JOURNAL_STREAM:-}" ]] && logger -t "$SCRIPT_NAME" -p "$level" "$*" } log_info() { # When in systemd: always log regardless of VERBOSE (journal is the destination) # When interactive: only log if -v is set if [[ -n "${JOURNAL_STREAM:-}" ]] || [[ $VERBOSE -ge 1 ]]; then _log user.info "[INFO]" "$@" fi } log_debug() { if [[ -n "${JOURNAL_STREAM:-}" ]] || [[ $VERBOSE -ge 2 ]]; then _log user.debug "[DEBUG]" "$@" fi } log_warning() { _log user.warning "[WARNING]" "$@" } log_error() { echo "[ERROR] $*" >&2 [[ -z "${JOURNAL_STREAM:-}" ]] && logger -t "$SCRIPT_NAME" -p user.err "$*" } log_success() { _log user.notice "[SUCCESS]" "$@" } usage() { cat </dev/null; then VM_HAS_LOCK[$vmid]=1 fi local vmstate vmstate=$(awk -F': ' '/^vmstate: / {print $2; exit}' "$conf" 2>/dev/null) if [[ -n "$vmstate" ]]; then VM_VMSTATE[$vmid]="$vmstate" VMSTATE_TO_VMID[$vmstate]="$vmid" fi done } load_vm_config_metadata() { VM_STATUS=() VM_NAME=() while read -r vmid name status _rest; do [[ "$vmid" == "VMID" ]] && continue VM_NAME[$vmid]="$name" done < <(qm list 2>/dev/null) refresh_vm_artifact_metadata } # Load all VM info in one pass - FAST load_vm_info() { load_vm_config_metadata # Get status and name from qm list (single call) while read -r vmid name status _rest; do [[ "$vmid" == "VMID" ]] && continue # skip header VM_STATUS[$vmid]="$status" VM_NAME[$vmid]="$name" done < <(qm list 2>/dev/null) # For "running" VMs, get actual status (qm list shows "running" for paused/suspended VMs) # This is only a few VMs so the overhead is acceptable for vmid in "${!VM_STATUS[@]}"; do if [[ "${VM_STATUS[$vmid]}" == "running" ]]; then local real_status real_status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') [[ -n "$real_status" ]] && VM_STATUS[$vmid]="$real_status" fi done } array_contains() { local needle="$1" shift local item for item in "$@"; do [[ "$item" == "$needle" ]] && return 0 done return 1 } append_unique() { local -n target_ref=$1 local value="$2" array_contains "$value" "${target_ref[@]}" || target_ref+=("$value") } remove_value() { local -n target_ref=$1 local value="$2" local filtered=() local item for item in "${target_ref[@]}"; do [[ "$item" == "$value" ]] && continue filtered+=("$item") done target_ref=("${filtered[@]}") } extract_suspend_file_date() { local vmid="$1" local volume="$2" local volume_name="${volume##*/}" if [[ "$volume_name" =~ ^vm-${vmid}-state-suspend-([0-9]{4}-[0-9]{2}-[0-9]{2})\.raw$ ]]; then echo "${BASH_REMATCH[1]}" fi } # Load all CT info in one pass - FAST load_ct_info() { # pct list columns: VMID Status Lock Name # When Lock is empty, read shifts Name into the lock variable while read -r vmid status lock name; do [[ "$vmid" == "VMID" ]] && continue # skip header if [[ -z "$name" ]]; then # No lock present: lock actually holds the name name="$lock" lock="" fi CT_STATUS[$vmid]="$status" CT_NAME[$vmid]="$name" done < <(pct list 2>/dev/null) } # Get VM name (from cache) get_vm_name() { echo "${VM_NAME[$1]:-unknown}" } vm_has_suspend_lock() { local vmid="$1" grep -q '^lock: suspended$' "/etc/pve/qemu-server/${vmid}.conf" 2>/dev/null } vm_has_vmstate_reference() { local vmid="$1" grep -q '^vmstate:' "/etc/pve/qemu-server/${vmid}.conf" 2>/dev/null } get_vm_vmstate_volume() { local vmid="$1" echo "${VM_VMSTATE[$vmid]:-}" } is_strict_suspend_volume_name() { local vmid="$1" local name="$2" [[ "$name" =~ ^vm-${vmid}-state-suspend-[0-9]{4}-[0-9]{2}-[0-9]{2}\.raw$ ]] } storage_cleanup_supports_path_scan() { local storage_type="$1" # Cleanup walks filesystem paths directly under /images. # Keep this limited to local directory-backed storages so a stale remote # mount cannot block planned maintenance in kernel I/O wait. [[ "$storage_type" == "dir" ]] } vmstate_volume_looks_like_suspend_artifact() { local vmid="$1" local volume="$2" local volume_name="${volume##*/}" [[ -n "$volume" ]] || return 1 is_strict_suspend_volume_name "$vmid" "$volume_name" } resolve_storage_volume_path() { local volume="$1" pvesm path "$volume" 2>/dev/null } vmstate_volume_exists() { local volume="$1" local resolved_path [[ -z "$volume" ]] && return 1 resolved_path=$(resolve_storage_volume_path "$volume") || return 1 [[ -n "$resolved_path" && -e "$resolved_path" ]] } remove_suspend_volume_by_volid() { local vmid="$1" local volume="$2" local name="${VM_NAME[$vmid]:-unknown}" local free_output if ! vmstate_volume_looks_like_suspend_artifact "$vmid" "$volume"; then log_warning "VM $vmid ($name) suspend volume does not look like a suspend artifact, leaving it untouched: ${volume:-none}" return 1 fi if [[ $DRY_RUN -eq 1 ]]; then echo "would remove stale vmstate volume for VM $vmid ($name): $volume" return 0 fi free_output=$(pvesm free "$volume" 2>&1) if [[ $? -eq 0 ]]; then log_info "Removed stale vmstate volume for VM $vmid ($name): $volume" return 0 fi if maybe_relax_quorum "$free_output"; then free_output=$(pvesm free "$volume" 2>&1) if [[ $? -eq 0 ]]; then log_info "Removed stale vmstate volume for VM $vmid ($name) after quorum recovery: $volume" return 0 fi fi if echo "$free_output" | grep -qiE 'does not exist|no such file|not found'; then log_info "Stale vmstate volume for VM $vmid ($name) was already absent: $volume" return 0 fi log_warning "VM $vmid ($name) stale vmstate volume could not be removed: $volume ($free_output)" return 1 } clear_vmstate_metadata() { local vmid="$1" local name="${VM_NAME[$vmid]:-unknown}" local set_output if [[ $DRY_RUN -eq 1 ]]; then echo "would remove stale vmstate metadata for VM $vmid ($name)" return 0 fi set_output=$(qm set "$vmid" --delete vmstate 2>&1) if [[ $? -eq 0 ]]; then log_info "Removed stale vmstate metadata for VM $vmid ($name)" return 0 fi if maybe_relax_quorum "$set_output"; then set_output=$(qm set "$vmid" --delete vmstate 2>&1) if [[ $? -eq 0 ]]; then log_info "Removed stale vmstate metadata for VM $vmid ($name) after quorum recovery" return 0 fi fi log_warning "VM $vmid ($name) stale vmstate metadata could not be removed: $set_output" return 1 } free_stale_vmstate_volume() { local vmid="$1" local volume="$2" remove_suspend_volume_by_volid "$vmid" "$volume" } cleanup_stale_suspend_artifacts() { local vmid="$1" local context="${2:-}" local name="${VM_NAME[$vmid]:-unknown}" local volume local had_issue=0 local cleanup_failed=0 volume=$(get_vm_vmstate_volume "$vmid") if vm_has_suspend_lock "$vmid"; then had_issue=1 if ! unlock_vm_suspend_lock "$vmid" "$context"; then cleanup_failed=1 fi fi if [[ -n "$volume" ]]; then had_issue=1 if vmstate_volume_exists "$volume"; then if ! free_stale_vmstate_volume "$vmid" "$volume"; then cleanup_failed=1 fi else log_info "VM $vmid ($name) has stale vmstate metadata pointing to missing volume: $volume" fi if ! clear_vmstate_metadata "$vmid"; then cleanup_failed=1 fi fi if [[ $had_issue -eq 0 ]]; then return 0 fi [[ $cleanup_failed -eq 0 ]] } vm_has_valid_suspend_state() { local vmid="$1" local volume vm_has_suspend_lock "$vmid" || return 1 vm_has_vmstate_reference "$vmid" || return 1 volume=$(get_vm_vmstate_volume "$vmid") vmstate_volume_looks_like_suspend_artifact "$vmid" "$volume" || return 1 vmstate_volume_exists "$volume" } get_referencing_vmid_for_vmstate() { local target_volume="$1" local vmid="${VMSTATE_TO_VMID[$target_volume]:-}" [[ -n "$vmid" ]] || return 1 echo "$vmid" return 0 } list_suspend_artifact_files() { awk ' BEGIN { RS = "" FS = "\n" } { type = "" name = "" path = "" content = "" split($1, header_parts, /:[[:space:]]+/) if (length(header_parts) >= 2) { type = header_parts[1] name = header_parts[2] } for (i = 2; i <= NF; i++) { line = $i sub(/^\t/, "", line) if (line ~ /^path /) { path = substr(line, 6) } else if (line ~ /^content /) { content = substr(line, 9) } } if (name != "" && path != "" && content ~ /(^|,)images(,|$)/) { print type "\t" name "\t" path } } ' /etc/pve/storage.cfg 2>/dev/null | while IFS=$'\t' read -r storage_type storage path; do [[ -z "$storage" || -z "$path" ]] && continue if ! storage_cleanup_supports_path_scan "$storage_type"; then continue fi [[ -d "${path}/images" ]] || continue local file for file in "${path}"/images/[0-9]*/vm-*-state-suspend-????-??-??.raw; do [[ -e "$file" ]] || continue local relative_path="${file#${path}/images/}" [[ "$relative_path" == "$file" ]] && continue local vm_dir="${relative_path%%/*}" local file_name="${relative_path##*/}" [[ "$vm_dir" =~ ^[0-9]+$ ]] || continue is_strict_suspend_volume_name "$vm_dir" "$file_name" || continue printf '%s\t%s:%s/%s\t%s\n' "$storage" "$storage" "$vm_dir" "$file_name" "$file" done done } cleanup_orphan_suspend_artifacts() { local cleaned_count=0 local skipped_count=0 local fail_count=0 local storage local volume local file_path local vmid log_info "Scanning storages for orphan suspend-state volumes..." while IFS=$'\t' read -r storage volume file_path; do [[ -z "$volume" ]] && continue if vmid=$(get_referencing_vmid_for_vmstate "$volume"); then if vm_has_valid_suspend_state "$vmid"; then log_info "Keeping active suspend-state volume for VM $vmid (${VM_NAME[$vmid]:-unknown}): $volume" ((skipped_count++)) else log_warning "VM $vmid (${VM_NAME[$vmid]:-unknown}) references inconsistent suspend artifacts - cleaning up" if cleanup_stale_suspend_artifacts "$vmid" "during cleanup"; then ((cleaned_count++)) else ((fail_count++)) fi fi continue fi if [[ $DRY_RUN -eq 1 ]]; then echo "would remove orphan suspend-state volume: $volume" log_debug "real path: $file_path" ((cleaned_count++)) continue fi if [[ "$volume" =~ ^([^:]+):([0-9]+)/vm-([0-9]+)-state-suspend-([0-9]{4}-[0-9]{2}-[0-9]{2})\.raw$ ]]; then vmid="${BASH_REMATCH[3]}" else log_warning "Skipping suspicious suspend-state volume with unexpected name: $volume" ((skipped_count++)) continue fi VM_NAME[$vmid]="${VM_NAME[$vmid]:-unknown}" if remove_suspend_volume_by_volid "$vmid" "$volume"; then log_info "Removed orphan suspend-state volume from $storage: $volume" ((cleaned_count++)) else ((fail_count++)) fi done < <(list_suspend_artifact_files) log_success "Suspend artifact cleanup complete: $cleaned_count cleaned, $skipped_count retained, $fail_count failed" return $fail_count } unlock_vm_suspend_lock() { local vmid="$1" local context="${2:-}" local name="${VM_NAME[$vmid]:-unknown}" local unlock_output if ! vm_has_suspend_lock "$vmid"; then return 0 fi if [[ $DRY_RUN -eq 1 ]]; then if [[ -n "$context" ]]; then echo "would remove stale suspend lock for VM $vmid ($name) $context" else echo "would remove stale suspend lock for VM $vmid ($name)" fi return 0 fi unlock_output=$(qm unlock "$vmid" 2>&1) if [[ $? -eq 0 ]]; then if [[ -n "$context" ]]; then log_info "Removed stale suspend lock for VM $vmid ($name) $context" else log_info "Removed stale suspend lock for VM $vmid ($name)" fi return 0 fi if maybe_relax_quorum "$unlock_output"; then unlock_output=$(qm unlock "$vmid" 2>&1) if [[ $? -eq 0 ]]; then if [[ -n "$context" ]]; then log_info "Removed stale suspend lock for VM $vmid ($name) $context after quorum recovery" else log_info "Removed stale suspend lock for VM $vmid ($name) after quorum recovery" fi return 0 fi fi if [[ -n "$context" ]]; then log_warning "VM $vmid ($name) has a stale suspend lock $context but it could not be removed: $unlock_output" else log_warning "VM $vmid ($name) has a stale suspend lock but it could not be removed: $unlock_output" fi return 1 } unlock_vm_if_needed() { unlock_vm_suspend_lock "$1" "while VM is running" } # Quorum-sensitive operations (qm suspend/start/resume) may fail during # cluster-wide maintenance when pmxcfs becomes read-only. In that case, relax # expected votes once and retry the failed operation. maybe_relax_quorum() { local cmd_output="$1" # Already attempted in this run. if [[ $QUORUM_RELAXED -eq 1 ]]; then return 1 fi if echo "$cmd_output" | grep -qiE "cluster not ready - no quorum|/etc/pve/.+\\.conf\\.tmp.+(Permission denied|Device or resource busy)"; then log_warning "Detected quorum-related write failure in /etc/pve - attempting temporary 'pvecm expected 1'" if pvecm expected 1 >/dev/null 2>&1; then QUORUM_RELAXED=1 log_warning "Applied 'pvecm expected 1' for this maintenance cycle; retrying operation" return 0 fi log_error "Failed to apply 'pvecm expected 1' after quorum-related error" fi return 1 } # Suspend a VM to disk suspend_vm_to_disk() { local vmid="$1" local name="${VM_NAME[$vmid]:-unknown}" local qm_output local stale_path local retry_output local stale_retry_path if [[ $DRY_RUN -eq 1 ]]; then echo "would suspend VM $vmid ($name) to disk" return 0 fi log_info "Suspending VM $vmid ($name) to disk..." qm_output=$(qm suspend "$vmid" --todisk 1 2>&1) if [[ $? -eq 0 ]]; then log_success "VM $vmid ($name) suspended to disk" return 0 fi # Recover from stale suspend image left from a previous interrupted suspend. # Proxmox can emit either: # - "stale saved state disk image ('...raw' already exists)" # - "disk image '...raw' already exists" stale_path=$( echo "$qm_output" | sed -n \ -e "s/.*stale saved state[[:space:]]*disk image ('\\([^']*\\)' already exists).*/\\1/p" \ -e "s/.*disk image '\\([^']*\\)' already exists.*/\\1/p" | head -n 1 ) if [[ -n "$stale_path" && "$stale_path" =~ /vm-${vmid}-state-suspend-[0-9]{4}-[0-9]{2}-[0-9]{2}\.raw$ && -f "$stale_path" ]]; then log_warning "VM $vmid ($name) has stale suspend image: $stale_path - removing and retrying once" if rm -f -- "$stale_path"; then retry_output=$(qm suspend "$vmid" --todisk 1 2>&1) if [[ $? -eq 0 ]]; then log_success "VM $vmid ($name) suspended to disk (after stale image cleanup)" return 0 fi if maybe_relax_quorum "$retry_output"; then retry_output=$(qm suspend "$vmid" --todisk 1 2>&1) if [[ $? -eq 0 ]]; then log_success "VM $vmid ($name) suspended to disk (after stale image cleanup + quorum recovery)" return 0 fi stale_retry_path=$( echo "$retry_output" | sed -n \ -e "s/.*stale saved state[[:space:]]*disk image ('\\([^']*\\)' already exists).*/\\1/p" \ -e "s/.*disk image '\\([^']*\\)' already exists.*/\\1/p" | head -n 1 ) if [[ -n "$stale_retry_path" && "$stale_retry_path" =~ /vm-${vmid}-state-suspend-[0-9]{4}-[0-9]{2}-[0-9]{2}\.raw$ && -f "$stale_retry_path" ]]; then log_warning "VM $vmid ($name) retry left stale suspend image: $stale_retry_path - removing and retrying once more" if rm -f -- "$stale_retry_path"; then retry_output=$(qm suspend "$vmid" --todisk 1 2>&1) if [[ $? -eq 0 ]]; then log_success "VM $vmid ($name) suspended to disk (after stale image cleanup + quorum recovery + retry)" return 0 fi fi fi fi log_error "Failed to suspend VM $vmid ($name) after stale image cleanup: $retry_output" return 1 fi log_error "Failed to remove stale suspend image for VM $vmid ($name): $stale_path" return 1 fi if maybe_relax_quorum "$qm_output"; then retry_output=$(qm suspend "$vmid" --todisk 1 2>&1) if [[ $? -eq 0 ]]; then log_success "VM $vmid ($name) suspended to disk (after quorum recovery)" return 0 fi stale_retry_path=$( echo "$retry_output" | sed -n \ -e "s/.*stale saved state[[:space:]]*disk image ('\\([^']*\\)' already exists).*/\\1/p" \ -e "s/.*disk image '\\([^']*\\)' already exists.*/\\1/p" | head -n 1 ) if [[ -n "$stale_retry_path" && "$stale_retry_path" =~ /vm-${vmid}-state-suspend-[0-9]{4}-[0-9]{2}-[0-9]{2}\.raw$ && -f "$stale_retry_path" ]]; then log_warning "VM $vmid ($name) quorum retry hit stale suspend image: $stale_retry_path - removing and retrying once more" if rm -f -- "$stale_retry_path"; then retry_output=$(qm suspend "$vmid" --todisk 1 2>&1) if [[ $? -eq 0 ]]; then log_success "VM $vmid ($name) suspended to disk (after quorum recovery + stale retry)" return 0 fi fi fi log_error "Failed to suspend VM $vmid ($name) after quorum recovery: $retry_output" return 1 fi log_error "Failed to suspend VM $vmid ($name) to disk: $qm_output" return 1 } # Resume a VM from disk suspend resume_vm() { local vmid="$1" local name="${VM_NAME[$vmid]:-unknown}" local qm_output local current_status if [[ $DRY_RUN -eq 1 ]]; then echo "would resume VM $vmid ($name)" return 0 fi log_info "Resuming VM $vmid ($name)..." qm_output=$(qm resume "$vmid" 2>&1) if [[ $? -eq 0 ]]; then unlock_vm_if_needed "$vmid" log_success "VM $vmid ($name) resumed successfully" return 0 fi if maybe_relax_quorum "$qm_output"; then qm_output=$(qm resume "$vmid" 2>&1) if [[ $? -eq 0 ]]; then unlock_vm_if_needed "$vmid" log_success "VM $vmid ($name) resumed successfully (after quorum recovery)" return 0 fi current_status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') if [[ "$current_status" == "running" ]]; then unlock_vm_if_needed "$vmid" log_warning "VM $vmid ($name) is running despite resume error after quorum recovery - treating as resumed" return 2 fi log_error "Failed to resume VM $vmid ($name) after quorum recovery: $qm_output" return 1 fi if echo "$qm_output" | grep -qi "already running"; then unlock_vm_if_needed "$vmid" log_warning "VM $vmid ($name) is already running - treating as resumed" return 2 fi current_status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') if [[ "$current_status" == "running" ]]; then unlock_vm_if_needed "$vmid" log_warning "VM $vmid ($name) is running despite resume error - treating as resumed" return 2 fi log_error "Failed to resume VM $vmid ($name): $qm_output" return 1 } # Graceful shutdown a CT shutdown_ct() { local ctid="$1" local name="${CT_NAME[$ctid]:-unknown}" if [[ $DRY_RUN -eq 1 ]]; then echo "would shutdown CT $ctid ($name)" return 0 fi log_info "Shutting down CT $ctid ($name)..." if pct shutdown "$ctid" --timeout 120; then log_success "CT $ctid ($name) shut down gracefully" return 0 else log_error "Failed to shutdown CT $ctid ($name)" return 1 fi } # Start a CT start_ct() { local ctid="$1" local name="${CT_NAME[$ctid]:-unknown}" local pct_output if [[ $DRY_RUN -eq 1 ]]; then echo "would start CT $ctid ($name)" return 0 fi log_info "Starting CT $ctid ($name)..." pct_output=$(pct start "$ctid" 2>&1) if [[ $? -eq 0 ]]; then log_success "CT $ctid ($name) started successfully" return 0 fi if maybe_relax_quorum "$pct_output"; then pct_output=$(pct start "$ctid" 2>&1) if [[ $? -eq 0 ]]; then log_success "CT $ctid ($name) started successfully (after quorum recovery)" return 0 fi if [[ "$(pct status "$ctid" 2>/dev/null | awk '{print $2}')" == "running" ]]; then log_warning "CT $ctid ($name) is running despite start error after quorum recovery - treating as started" return 2 fi log_error "Failed to start CT $ctid ($name) after quorum recovery: $pct_output" return 1 fi if echo "$pct_output" | grep -qi "already running"; then log_warning "CT $ctid ($name) is already running - treating as started" return 2 fi if [[ "$(pct status "$ctid" 2>/dev/null | awk '{print $2}')" == "running" ]]; then log_warning "CT $ctid ($name) is running despite start error - treating as started" return 2 fi log_error "Failed to start CT $ctid ($name): $pct_output" return 1 } # Save state to JSON file # Usage: save_state vm_resume_array vm_suspended_array ct_start_array save_state() { local -n to_resume_ref=$1 local -n was_suspended_ref=$2 local -n ct_to_start_ref=$3 local existing_state_json="" local existing_to_resume=() local existing_was_suspended=() local existing_ct_to_start=() local final_to_resume=() local final_was_suspended=() local final_ct_to_start=() local vmid local volume local suspend_date local -A existing_vm_volume=() local -A existing_vm_date=() local -A current_vm_volume=() local -A current_vm_date=() if [[ $DRY_RUN -eq 1 ]]; then echo "would save state to $STATE_FILE" echo " to_resume (VMs): ${to_resume_ref[*]}" echo " was_suspended (VMs): ${was_suspended_ref[*]}" echo " ct_to_start (CTs): ${ct_to_start_ref[*]}" return 0 fi if existing_state_json=$(load_state 2>/dev/null); then mapfile -t existing_to_resume < <(echo "$existing_state_json" | jq -r '.to_resume[]?' 2>/dev/null) mapfile -t existing_was_suspended < <(echo "$existing_state_json" | jq -r '.was_suspended[]?' 2>/dev/null) mapfile -t existing_ct_to_start < <(echo "$existing_state_json" | jq -r '.ct_to_start[]?' 2>/dev/null) while IFS=$'\t' read -r vmid volume suspend_date; do [[ -z "$vmid" ]] && continue existing_vm_volume[$vmid]="$volume" existing_vm_date[$vmid]="$suspend_date" done < <( echo "$existing_state_json" | jq -r ' (.vm_details // {}) | to_entries[] | [.key, (.value.suspend_volume // ""), (.value.suspend_file_date // "")] | @tsv ' 2>/dev/null ) fi refresh_vm_artifact_metadata for vmid in "${to_resume_ref[@]}"; do append_unique final_to_resume "$vmid" volume="${VM_VMSTATE[$vmid]:-}" suspend_date=$(extract_suspend_file_date "$vmid" "$volume") current_vm_volume[$vmid]="$volume" current_vm_date[$vmid]="$suspend_date" done for vmid in "${existing_to_resume[@]}"; do append_unique final_to_resume "$vmid" done for vmid in "${existing_was_suspended[@]}"; do if ! array_contains "$vmid" "${final_to_resume[@]}"; then append_unique final_was_suspended "$vmid" fi done for vmid in "${was_suspended_ref[@]}"; do if array_contains "$vmid" "${final_to_resume[@]}"; then volume="${VM_VMSTATE[$vmid]:-}" if [[ -n "$volume" ]]; then current_vm_volume[$vmid]="$volume" current_vm_date[$vmid]="$(extract_suspend_file_date "$vmid" "$volume")" fi continue fi append_unique final_was_suspended "$vmid" volume="${VM_VMSTATE[$vmid]:-}" suspend_date=$(extract_suspend_file_date "$vmid" "$volume") current_vm_volume[$vmid]="$volume" current_vm_date[$vmid]="$suspend_date" done for vmid in "${final_to_resume[@]}"; do remove_value final_was_suspended "$vmid" done for vmid in "${existing_ct_to_start[@]}"; do append_unique final_ct_to_start "$vmid" done for vmid in "${ct_to_start_ref[@]}"; do append_unique final_ct_to_start "$vmid" done # Create JSON arrays (handle empty arrays properly) local to_resume_json="[]" local was_suspended_json="[]" local ct_to_start_json="[]" local vm_details_json="{}" if [[ ${#final_to_resume[@]} -gt 0 ]]; then to_resume_json=$(printf '%s\n' "${final_to_resume[@]}" | jq -R . | jq -s .) fi if [[ ${#final_was_suspended[@]} -gt 0 ]]; then was_suspended_json=$(printf '%s\n' "${final_was_suspended[@]}" | jq -R . | jq -s .) fi if [[ ${#final_ct_to_start[@]} -gt 0 ]]; then ct_to_start_json=$(printf '%s\n' "${final_ct_to_start[@]}" | jq -R . | jq -s .) fi for vmid in "${final_to_resume[@]}"; do volume="${current_vm_volume[$vmid]:-${existing_vm_volume[$vmid]:-}}" suspend_date="${current_vm_date[$vmid]:-${existing_vm_date[$vmid]:-}}" vm_details_json=$( jq \ --arg vmid "$vmid" \ --arg mode "to_resume" \ --arg volume "$volume" \ --arg suspend_date "$suspend_date" \ ' .[$vmid] = { mode: $mode, suspend_volume: $volume, suspend_file_date: $suspend_date } ' <<<"$vm_details_json" ) done for vmid in "${final_was_suspended[@]}"; do volume="${current_vm_volume[$vmid]:-${existing_vm_volume[$vmid]:-}}" suspend_date="${current_vm_date[$vmid]:-${existing_vm_date[$vmid]:-}}" vm_details_json=$( jq \ --arg vmid "$vmid" \ --arg mode "was_suspended" \ --arg volume "$volume" \ --arg suspend_date "$suspend_date" \ ' .[$vmid] = { mode: $mode, suspend_volume: $volume, suspend_file_date: $suspend_date } ' <<<"$vm_details_json" ) done cat > "$STATE_FILE" </dev/null)) local was_suspended=($(echo "$state_json" | jq -r '.was_suspended[]' 2>/dev/null)) local ct_to_start=($(echo "$state_json" | jq -r '.ct_to_start[]' 2>/dev/null)) local saved_timestamp=$(echo "$state_json" | jq -r '.timestamp' 2>/dev/null) local -A saved_vm_volume=() local -A saved_vm_date=() local saved_volume local current_volume while IFS=$'\t' read -r vmid saved_volume saved_date; do [[ -z "$vmid" ]] && continue saved_vm_volume[$vmid]="$saved_volume" saved_vm_date[$vmid]="$saved_date" done < <( echo "$state_json" | jq -r ' (.vm_details // {}) | to_entries[] | [.key, (.value.suspend_volume // ""), (.value.suspend_file_date // "")] | @tsv ' 2>/dev/null ) log_info "State file from: $saved_timestamp" local resume_count=0 local skip_count=0 local fail_count=0 # --- Resume QEMU VMs --- # Log warnings for VMs that won't be resumed for vmid in "${was_suspended[@]}"; do local name="${VM_NAME[$vmid]:-unknown}" log_warning "VM $vmid ($name) was already suspended before maintenance - NOT auto-resuming" ((skip_count++)) done # Resume VMs that should be resumed for vmid in "${to_resume[@]}"; do local name="${VM_NAME[$vmid]:-unknown}" # Verify VM still exists and has suspend lock if [[ ! -f "/etc/pve/qemu-server/${vmid}.conf" ]]; then log_error "VM $vmid config not found - skipping" ((fail_count++)) continue fi if [[ -z "${VM_HAS_LOCK[$vmid]}" ]]; then log_warning "VM $vmid ($name) no longer has suspend lock - may have been manually resumed" ((skip_count++)) continue fi saved_volume="${saved_vm_volume[$vmid]:-}" current_volume="${VM_VMSTATE[$vmid]:-}" if [[ -n "$saved_volume" && "$current_volume" != "$saved_volume" ]]; then log_warning "VM $vmid ($name) suspend volume changed since state file (${saved_vm_date[$vmid]:-unknown date}): saved=$saved_volume current=${current_volume:-none} - skipping auto-resume" ((skip_count++)) continue fi resume_vm "$vmid" case $? in 0) ((resume_count++)) ;; 2) ((skip_count++)) ;; *) ((fail_count++)) ;; esac done # --- Start LXC Containers --- for ctid in "${ct_to_start[@]}"; do local name="${CT_NAME[$ctid]:-unknown}" # Verify CT still exists if [[ ! -f "/etc/pve/lxc/${ctid}.conf" ]]; then log_error "CT $ctid config not found - skipping" ((fail_count++)) continue fi # Check if already running (someone started it manually) if [[ "${CT_STATUS[$ctid]}" == "running" ]]; then log_warning "CT $ctid ($name) is already running - skipping" ((skip_count++)) continue fi start_ct "$ctid" case $? in 0) ((resume_count++)) ;; 2) ((skip_count++)) ;; *) ((fail_count++)) ;; esac done # Clear state file only on full success; keep it for retry if any failures. if [[ $fail_count -eq 0 ]]; then clear_state else log_warning "Resume/start encountered failures - keeping state file for retry" fi # Summary log_success "Resume/start complete: $resume_count restored, $skip_count skipped, $fail_count failed" return $fail_count } # Acquire lock to prevent concurrent runs acquire_lock() { if [[ $DRY_RUN -eq 1 ]]; then return 0 fi if [[ -f "$LOCK_FILE" ]]; then local pid=$(cat "$LOCK_FILE" 2>/dev/null) if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then log_error "Another instance is running (PID $pid)" exit 1 fi # Stale lock file rm -f "$LOCK_FILE" fi echo $$ > "$LOCK_FILE" trap "rm -f '$LOCK_FILE'" EXIT } # Parse command line COMMAND="" while [[ $# -gt 0 ]]; do case "$1" in suspend|resume|cleanup) COMMAND="$1" shift ;; -n|--dry-run) DRY_RUN=1 shift ;; -v|--verbose) ((VERBOSE++)) shift ;; -vv) VERBOSE=2 shift ;; -h|--help) usage exit 0 ;; *) echo "Unknown option: $1" >&2 usage exit 1 ;; esac done if [[ -z "$COMMAND" ]]; then echo "Error: No command specified" >&2 usage exit 1 fi # Ensure state directory exists mkdir -p "$STATE_DIR" # Migrate state from the legacy location used by older installs. migrate_legacy_state_if_needed # Acquire lock acquire_lock # Execute command case "$COMMAND" in suspend) do_suspend exit $? ;; resume) do_resume exit $? ;; cleanup) do_cleanup exit $? ;; esac