#!/bin/bash
#
# Homelab Health Check & Recovery Script
# Run this to check status and bring services online
#
# Usage: ./health-check.sh [--fix]
#   Without --fix: Read-only health check
#   With --fix:    Attempt to start stopped services and fix issues
#
# Environment:
#   HOMELAB_SSH_PASS  Root SSH password used for both Proxmox hosts.
#                     Overrides the built-in fallback in the Config section.
#

set -e

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Config
# SECURITY NOTE(review): the literal fallback password is kept only so existing
# invocations keep working. It should be rotated and removed from this file;
# export HOMELAB_SSH_PASS (or switch to SSH keys) instead.
SSH_PASS="${HOMELAB_SSH_PASS:-GrilledCh33s3#}"
PVE_IP="10.10.10.120"
PVE2_IP="10.10.10.102"
SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=5)

CRITICAL_VMS="100 101 110 206" # TrueNAS, Saltbox, HomeAssistant, Docker-host
CRITICAL_CTS="200 202"         # PiHole, Traefik

# Remote one-liner: prints every hwmon sensor labelled "Tctl" in whole degrees C.
TCTL_PROBE='for f in /sys/class/hwmon/hwmon*/temp*_input; do label=$(cat ${f%_input}_label 2>/dev/null); if [ "$label" = "Tctl" ]; then echo $(($(cat $f)/1000)); fi; done'

# Run state (filled in by the checks, consumed by recovery and the summary)
FIX_MODE=false
STOPPED_VMS=""
STOPPED_CTS=""
TRUENAS_ZFS_SUSPENDED=false
FIX_FAILURES=0

# ============================================
# Helper functions
# ============================================

# ssh_host HOST CMD...
# Run CMD as root on HOST. The password is handed to sshpass through the
# SSHPASS environment variable (-e) so it does not show up in the process
# list. stderr is discarded on purpose: callers only look at stdout/status.
ssh_host() {
  local host=$1
  shift
  SSHPASS="$SSH_PASS" sshpass -e ssh "${SSH_OPTS[@]}" "root@${host}" "$@" 2>/dev/null
}

# Run a command on the primary / secondary Proxmox host.
ssh_pve() {
  ssh_host "$PVE_IP" "$@"
}

ssh_pve2() {
  ssh_host "$PVE2_IP" "$@"
}

# print_status MESSAGE LEVEL
# LEVEL "ok" prints a green check, "warn" a yellow bang, anything else a red cross.
print_status() {
  case "$2" in
    ok)   echo -e " ${GREEN}✓${NC} $1" ;;
    warn) echo -e " ${YELLOW}!${NC} $1" ;;
    *)    echo -e " ${RED}✗${NC} $1" ;;
  esac
}

# Reads `qm guest exec` JSON on stdin and prints the "out-data" payload with
# the escaped \n / \t sequences turned into spaces and quotes removed.
# (Deleting the characters '\', 'n' and '"' individually would also eat every
# lowercase "n" in the payload.)
guest_out_data() {
  grep -o '"out-data"[^}]*' \
    | sed -e 's/"out-data"[[:space:]]*:[[:space:]]*"//' \
          -e 's/\\[nt]/ /g' \
          -e 's/"//g' \
          -e 's/[[:space:]]*$//'
}

# Prints the largest integer found on stdin (one value per line), or nothing
# when no integer line is present.
max_int() {
  awk '/^-?[0-9]+$/ { if (!seen || $1 + 0 > max) { max = $1 + 0; seen = 1 } }
       END { if (seen) print max }'
}

# check_cpu_temp SSH_FN
# Reports the hottest Tctl reading of the host reached through SSH_FN.
# Prints nothing when the host exposes no Tctl sensor or cannot be queried.
check_cpu_temp() {
  local ssh_fn=$1
  local temp
  # Several Tctl sensors yield several lines; compare only the maximum.
  temp=$("$ssh_fn" "$TCTL_PROBE" | max_int) || true
  [[ -n "$temp" ]] || return 0

  if (( temp < 85 )); then
    print_status "CPU Temp: ${temp}°C" "ok"
  elif (( temp < 90 )); then
    print_status "CPU Temp: ${temp}°C (warm)" "warn"
  else
    print_status "CPU Temp: ${temp}°C (HOT!)" "fail"
  fi
}

# check_vms SSH_FN [CRITICAL_IDS]
# Lists the VMs of the host reached through SSH_FN. Every non-running VM whose
# id appears in the space-separated CRITICAL_IDS is appended to STOPPED_VMS.
check_vms() {
  local ssh_fn=$1
  local critical=${2:-}
  local vmid name status

  while read -r vmid name status _; do
    [[ -n "$vmid" ]] || continue
    if [[ "$status" == "running" ]]; then
      print_status "$vmid $name: $status" "ok"
    else
      print_status "$vmid $name: $status" "fail"
      if [[ -n "$critical" && " $critical " == *" $vmid "* ]]; then
        STOPPED_VMS="$STOPPED_VMS $vmid"
      fi
    fi
  done < <("$ssh_fn" "qm list" | tail -n +2)
}

# Lists the containers on PVE and records stopped critical ones in STOPPED_CTS.
check_containers() {
  local ctid status name

  # `pct list` columns are VMID, Status, Lock, Name. Lock is usually empty, so
  # the name is the LAST whitespace-separated field, not always the fourth.
  while read -r ctid status name; do
    [[ -n "$ctid" ]] || continue
    if [[ "$status" == "running" ]]; then
      print_status "$ctid $name: $status" "ok"
    else
      print_status "$ctid $name: $status" "fail"
      if [[ " $CRITICAL_CTS " == *" $ctid "* ]]; then
        STOPPED_CTS="$STOPPED_CTS $ctid"
      fi
    fi
  done < <(ssh_pve "pct list" | tail -n +2 \
    | awk 'NF >= 2 { print $1, $2, (NF > 2 ? $NF : "") }')
}

# Reports the health of the "vault" pool inside TrueNAS (VM 100) when that VM
# is running; sets TRUENAS_ZFS_SUSPENDED when the pool is suspended.
check_truenas_pool() {
  local health

  if ! ssh_pve "qm status 100" | grep -q running; then
    return 0
  fi

  echo ""
  echo " TrueNAS ZFS:"
  health=$(ssh_pve 'qm guest exec 100 -- bash -c "zpool list -H -o name,health vault 2>/dev/null"' \
    | guest_out_data) || true
  health=${health:-Unknown}

  if [[ "$health" == *"ONLINE"* ]]; then
    print_status "vault pool: ONLINE" "ok"
  elif [[ "$health" == *"SUSPENDED"* ]]; then
    print_status "vault pool: SUSPENDED (needs zpool clear)" "fail"
    TRUENAS_ZFS_SUSPENDED=true
  elif [[ "$health" == *"DEGRADED"* ]]; then
    print_status "vault pool: DEGRADED" "warn"
  else
    print_status "vault pool: $health" "fail"
  fi
}

# start_guest TOOL ID LABEL
# Starts a VM (TOOL=qm) or container (TOOL=pct) on PVE and reports the result.
# A failed start is counted in FIX_FAILURES so the summary can mention it.
start_guest() {
  local tool=$1 id=$2 label=$3

  if ssh_pve "$tool start $id"; then
    print_status "$label started" "ok"
  else
    print_status "Failed to start $label" "fail"
    FIX_FAILURES=$((FIX_FAILURES + 1))
  fi
}

# ============================================
# PVE (Primary Server)
# ============================================
check_pve() {
  local quorum zfs_status

  echo "--- PVE (${PVE_IP}) ---"

  # Check connectivity. Without PVE nothing else can be checked, so only
  # report whether PVE2 answers and stop.
  if ssh_pve "echo ok" > /dev/null 2>&1; then
    print_status "PVE Reachable" "ok"
  else
    print_status "PVE Unreachable" "fail"
    echo ""
    echo "--- PVE2 (${PVE2_IP}) ---"
    if ssh_pve2 "echo ok" > /dev/null 2>&1; then
      print_status "PVE2 Reachable" "ok"
    else
      print_status "PVE2 Unreachable" "fail"
    fi
    exit 1
  fi

  # Check cluster quorum
  quorum=$(ssh_pve "pvecm status 2>&1 | grep 'Quorate:' | awk '{print \$2}'") || true
  quorum=${quorum:-Unknown}
  if [[ "$quorum" == "Yes" ]]; then
    print_status "Cluster Quorum: $quorum" "ok"
  else
    print_status "Cluster Quorum: $quorum" "fail"
  fi

  # Check CPU temp
  check_cpu_temp ssh_pve

  # Check ZFS pools
  zfs_status=$(ssh_pve "zpool status -x") || true
  zfs_status=${zfs_status:-Unknown}
  if [[ "$zfs_status" == "all pools are healthy" ]]; then
    print_status "ZFS Pools: Healthy" "ok"
  else
    print_status "ZFS Pools: $zfs_status" "fail"
  fi

  # Check VMs
  echo ""
  echo " VMs:"
  check_vms ssh_pve "$CRITICAL_VMS"

  # Check TrueNAS ZFS (VM 100) if running
  check_truenas_pool

  # Check Containers
  echo ""
  echo " Containers:"
  check_containers
}

# ============================================
# PVE2 (Secondary Server)
# ============================================
check_pve2() {
  echo ""
  echo "--- PVE2 (${PVE2_IP}) ---"

  if ! ssh_pve2 "echo ok" > /dev/null 2>&1; then
    print_status "PVE2 Unreachable" "fail"
    return 0
  fi
  print_status "PVE2 Reachable" "ok"

  # Check CPU temp
  check_cpu_temp ssh_pve2

  # Check VMs (no VM on PVE2 is treated as critical)
  echo ""
  echo " VMs:"
  check_vms ssh_pve2
}

# ============================================
# FIX MODE - Start stopped services
# ============================================
run_recovery() {
  local clear_result vmid ctid name mount_count

  echo ""
  echo "================================"
  echo " RECOVERY MODE"
  echo "================================"

  # Fix TrueNAS ZFS SUSPENDED state first (critical for mounts)
  if [[ "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
    echo ""
    echo "Clearing TrueNAS ZFS pool errors..."
    clear_result=$(ssh_pve 'qm guest exec 100 -- bash -c "zpool clear vault 2>&1 && zpool list -H -o health vault"' \
      | guest_out_data) || true
    clear_result=${clear_result:-FAILED}
    if [[ "$clear_result" == *"ONLINE"* ]]; then
      print_status "vault pool recovered: ONLINE" "ok"
    else
      print_status "vault pool recovery failed: $clear_result" "fail"
      FIX_FAILURES=$((FIX_FAILURES + 1))
    fi
    sleep 5 # Give ZFS time to stabilize
  fi

  # Start TrueNAS first (it provides storage)
  if [[ " $STOPPED_VMS " == *" 100 "* ]]; then
    echo ""
    echo "Starting TrueNAS (VM 100) first..."
    start_guest qm 100 "TrueNAS"
    echo "Waiting 60s for TrueNAS to boot..."
    sleep 60
  fi

  # Start other VMs
  for vmid in $STOPPED_VMS; do
    [[ "$vmid" != "100" ]] || continue
    # A failed name lookup must not abort recovery under `set -e`.
    name=$(ssh_pve "qm config $vmid | grep '^name:' | awk '{print \$2}'") || name=""
    echo "Starting VM $vmid ($name)..."
    start_guest qm "$vmid" "$name"
    sleep 5
  done

  # Start containers
  for ctid in $STOPPED_CTS; do
    name=$(ssh_pve "pct config $ctid | grep '^hostname:' | awk '{print \$2}'") || name=""
    echo "Starting CT $ctid ($name)..."
    start_guest pct "$ctid" "$name"
    sleep 3
  done

  # Mount TrueNAS shares on Saltbox if Saltbox is running
  if ssh_pve "qm status 101" | grep -q running; then
    echo ""
    echo "Checking TrueNAS mounts on Saltbox..."
    sleep 10 # Give services time to start

    # Take the whole first number, so a count such as "12" is not split into
    # separate digits.
    mount_count=$(ssh_pve 'qm guest exec 101 -- bash -c "mount | grep -c Media"' \
      | guest_out_data | grep -Eo '[0-9]+' | head -n 1) || true
    mount_count=${mount_count:-0}

    if [[ "$mount_count" == "0" ]]; then
      echo "Mounting TrueNAS shares..."
      if ssh_pve 'qm guest exec 101 -- bash -c "mount /mnt/local/Media; mount /mnt/local/downloads"'; then
        print_status "TrueNAS mounts attempted" "ok"
      else
        print_status "TrueNAS mount command failed" "fail"
        FIX_FAILURES=$((FIX_FAILURES + 1))
      fi

      # Restart Immich
      echo "Restarting Immich..."
      if ssh_pve 'qm guest exec 101 -- bash -c "docker restart immich"'; then
        print_status "Immich restarted" "ok"
      else
        print_status "Immich restart failed" "fail"
        FIX_FAILURES=$((FIX_FAILURES + 1))
      fi
    else
      print_status "TrueNAS mounts already present" "ok"
    fi
  fi
}

# ============================================
# Summary
# ============================================
print_summary() {
  local issues=0

  echo ""
  echo "================================"
  echo " SUMMARY"
  echo "================================"

  if [[ "$FIX_MODE" == "true" ]]; then
    # In fix mode the stopped services were already acted upon; what matters
    # is whether any recovery action failed.
    if (( FIX_FAILURES > 0 )); then
      echo -e "${RED}Recovery actions failed:${NC} $FIX_FAILURES"
      issues=$((issues + 1))
    fi
  else
    if [[ -n "$STOPPED_VMS" ]]; then
      echo -e "${YELLOW}Stopped critical VMs:${NC}$STOPPED_VMS"
      issues=$((issues + 1))
    fi
    if [[ -n "$STOPPED_CTS" ]]; then
      echo -e "${YELLOW}Stopped critical containers:${NC}$STOPPED_CTS"
      issues=$((issues + 1))
    fi
    if [[ "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
      echo -e "${RED}TrueNAS ZFS pool SUSPENDED!${NC} SMB mounts will fail."
      issues=$((issues + 1))
    fi
  fi

  if (( issues == 0 )); then
    echo -e "${GREEN}All critical services healthy!${NC}"
  elif [[ "$FIX_MODE" == "true" ]]; then
    echo ""
    echo -e "Recovery incomplete - re-run ${YELLOW}./health-check.sh${NC} to review the current state"
  else
    echo ""
    echo -e "Run ${YELLOW}./health-check.sh --fix${NC} to attempt recovery"
  fi

  echo ""
  echo "Done: $(date '+%Y-%m-%d %H:%M:%S')"
}

main() {
  if [[ "${1:-}" == "--fix" ]]; then
    FIX_MODE=true
    echo -e "${YELLOW}Running in FIX mode - will attempt to start stopped services${NC}"
    echo ""
  fi

  # Check if sshpass is installed
  if ! command -v sshpass &> /dev/null; then
    echo -e "${RED}Error: sshpass is not installed${NC}"
    echo "Install with: brew install hudochenkov/sshpass/sshpass"
    exit 1
  fi

  echo "================================"
  echo " HOMELAB HEALTH CHECK"
  echo " $(date '+%Y-%m-%d %H:%M:%S')"
  echo "================================"
  echo ""

  check_pve
  check_pve2

  if [[ "$FIX_MODE" == "true" ]] \
    && [[ -n "$STOPPED_VMS" || -n "$STOPPED_CTS" || "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
    run_recovery
  fi

  print_summary
}

main "$@"