Initial commit: Homelab infrastructure documentation
- CLAUDE.md: Main homelab assistant context and instructions
- IP-ASSIGNMENTS.md: Complete IP address assignments
- NETWORK.md: Network bridges, VLANs, and configuration
- EMC-ENCLOSURE.md: EMC storage enclosure documentation
- SYNCTHING.md: Syncthing setup and device list
- SHELL-ALIASES.md: ZSH aliases for Claude Code sessions
- HOMEASSISTANT.md: Home Assistant API and automations
- INFRASTRUCTURE.md: Server hardware and power management
- configs/: Shared shell configurations
- scripts/: Utility scripts
- mcp-central/: MCP server configuration

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
318
scripts/health-check.sh
Executable file
318
scripts/health-check.sh
Executable file
@@ -0,0 +1,318 @@
|
||||
#!/bin/bash
#
# Homelab Health Check & Recovery Script
# Run this to check status and bring services online
#
# Usage: ./health-check.sh [--fix]
#   Without --fix: Read-only health check
#   With --fix:    Attempt to start stopped services and fix issues
#
# Requires: sshpass on the machine running this script.
#

# Abort on any unhandled command failure. Commands used as an `if` condition
# or followed by `||` are exempt, so every probe whose failure should be
# reported rather than fatal must be written in one of those forms.
set -e

# Colors (backslash escapes are expanded when the value is printed)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Config
# NOTE(review): a root password is committed here in plain text. Set
# HOMELAB_SSH_PASS in the environment to override it; better still, switch to
# SSH keys and rotate this credential.
SSH_PASS="${HOMELAB_SSH_PASS:-GrilledCh33s3#}"
PVE_IP="10.10.10.120"
PVE2_IP="10.10.10.102"
# Kept as one string: callers expand it unquoted so it splits into options.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5"

# Recovery is opt-in: only a literal first argument of "--fix" enables it.
# Any other argument is ignored and the run stays read-only.
FIX_MODE=false
if [[ "${1:-}" == "--fix" ]]; then
    FIX_MODE=true
    echo -e "${YELLOW}Running in FIX mode - will attempt to start stopped services${NC}"
    echo ""
fi

# Helper functions

#######################################
# Run a command as root on the primary Proxmox host over SSH.
# Globals:   SSH_PASS, SSH_OPTS, PVE_IP (read)
# Arguments: the remote command line, passed through to ssh unchanged
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   the exit status of sshpass/ssh
#######################################
ssh_pve() {
    # Pass the password through the SSHPASS environment variable (-e) instead
    # of -p so it does not show up in the local process list.
    # SSH_OPTS is deliberately unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    SSHPASS="$SSH_PASS" sshpass -e ssh $SSH_OPTS "root@${PVE_IP}" "$@" 2>/dev/null
}

#######################################
# Run a command as root on the secondary Proxmox host over SSH.
# Globals:   SSH_PASS, SSH_OPTS, PVE2_IP (read)
# Arguments: the remote command line, passed through to ssh unchanged
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   the exit status of sshpass/ssh
#######################################
ssh_pve2() {
    # Pass the password through the SSHPASS environment variable (-e) instead
    # of -p so it does not show up in the local process list.
    # SSH_OPTS is deliberately unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    SSHPASS="$SSH_PASS" sshpass -e ssh $SSH_OPTS "root@${PVE2_IP}" "$@" 2>/dev/null
}

#######################################
# Print one coloured status line.
# Globals:   GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - message text
#            $2 - "ok", "warn", or anything else (treated as failure)
# Outputs:   " <symbol> <message>" on stdout
#######################################
print_status() {
    local message=$1
    local level=${2:-}
    local color symbol

    case "$level" in
        ok)
            color=${GREEN:-}
            symbol='✓'
            ;;
        warn)
            color=${YELLOW:-}
            symbol='!'
            ;;
        *)
            color=${RED:-}
            symbol='✗'
            ;;
    esac

    # %b expands the escapes in the colour codes only; the message goes
    # through %s so backslashes in remote output are printed literally.
    printf ' %b%s%b %s\n' "$color" "$symbol" "${NC:-}" "$message"
}

# Check if sshpass is installed. Every remote probe below goes through it,
# so stop here with install instructions (on stderr) when it is missing.
if ! command -v sshpass > /dev/null 2>&1; then
    echo -e "${RED}Error: sshpass is not installed${NC}" >&2
    echo "Install with: brew install hudochenkov/sshpass/sshpass" >&2
    exit 1
fi

# Report banner with the start time of this run.
echo "================================"
echo " HOMELAB HEALTH CHECK"
echo " $(date '+%Y-%m-%d %H:%M:%S')"
echo "================================"
echo ""

# ============================================
# PVE (Primary Server)
# ============================================
echo "--- PVE (${PVE_IP}) ---"

# Check connectivity. All remaining PVE checks need this host, so when it is
# unreachable only report whether PVE2 answers and then exit with status 1.
if ssh_pve "echo ok" > /dev/null 2>&1; then
    print_status "PVE Reachable" "ok"
else
    print_status "PVE Unreachable" "fail"
    echo ""
    echo "--- PVE2 (${PVE2_IP}) ---"
    if ssh_pve2 "echo ok" > /dev/null 2>&1; then
        print_status "PVE2 Reachable" "ok"
    else
        print_status "PVE2 Unreachable" "fail"
    fi
    exit 1
fi

# Check cluster quorum
# The remote pipeline ends in awk, which exits 0 even when there is no
# "Quorate:" line, so an empty result is mapped to "Unknown" explicitly
# instead of relying on the command's exit status.
QUORUM=$(ssh_pve "pvecm status 2>&1 | awk '/Quorate:/ {print \$2}'") || QUORUM=""
QUORUM=${QUORUM:-Unknown}
if [[ "$QUORUM" == "Yes" ]]; then
    print_status "Cluster Quorum: $QUORUM" "ok"
else
    print_status "Cluster Quorum: $QUORUM" "fail"
fi

# Check CPU temp
# The remote loop prints one value (sensor reading divided by 1000) for every
# hwmon input whose label is "Tctl". There may be several, so keep the highest.
# Running it through a pipeline also keeps an SSH failure from aborting the
# whole script under `set -e`; it just yields an empty value.
TEMP=$(ssh_pve 'for f in /sys/class/hwmon/hwmon*/temp*_input; do label=$(cat ${f%_input}_label 2>/dev/null); if [ "$label" = "Tctl" ]; then echo $(($(cat $f)/1000)); fi; done' | sort -n | tail -n 1)
# Only compare when a plain integer came back; otherwise print nothing.
if [[ "$TEMP" =~ ^[0-9]+$ ]]; then
    if [[ "$TEMP" -lt 85 ]]; then
        print_status "CPU Temp: ${TEMP}°C" "ok"
    elif [[ "$TEMP" -lt 90 ]]; then
        print_status "CPU Temp: ${TEMP}°C (warm)" "warn"
    else
        print_status "CPU Temp: ${TEMP}°C (HOT!)" "fail"
    fi
fi

# Check ZFS pools
# Anything other than the exact "all pools are healthy" line is shown verbatim
# as a failure. A failed or silent probe is reported as "Unknown".
ZFS_STATUS=$(ssh_pve "zpool status -x" || echo "Unknown")
ZFS_STATUS=${ZFS_STATUS:-Unknown}
if [[ "$ZFS_STATUS" == "all pools are healthy" ]]; then
    print_status "ZFS Pools: Healthy" "ok"
else
    print_status "ZFS Pools: $ZFS_STATUS" "fail"
fi

# Check VMs
echo ""
echo " VMs:"
CRITICAL_VMS="100 101 110 206" # TrueNAS, Saltbox, HomeAssistant, Docker-host
STOPPED_VMS=""
TRUENAS_ZFS_SUSPENDED=false

#######################################
# Report each VM row read from stdin and queue stopped critical VMs.
# Globals:   CRITICAL_VMS (read), STOPPED_VMS (appended to)
# Inputs:    `qm list` rows without the header: "VMID NAME STATUS ..."
# Outputs:   one print_status line per non-blank row
#######################################
report_pve_vms() {
    local vmid name status
    # `read` splits the columns itself (no awk per field); `_` takes the rest.
    while read -r vmid name status _; do
        # Skip blank rows instead of reporting them as a failed VM.
        [[ -n "$vmid" ]] || continue
        if [[ "$status" == "running" ]]; then
            print_status "$vmid $name: $status" "ok"
        else
            print_status "$vmid $name: $status" "fail"
            # Only critical VMs are queued for --fix recovery.
            if [[ " $CRITICAL_VMS " == *" $vmid "* ]]; then
                STOPPED_VMS="$STOPPED_VMS $vmid"
            fi
        fi
    done
}

# Process substitution keeps the loop in this shell so STOPPED_VMS survives.
report_pve_vms < <(ssh_pve "qm list" | tail -n +2)

#######################################
# Extract the "out-data" value from `qm guest exec` JSON read on stdin.
# Outputs:   the value with JSON "\n" / "\t" escapes turned into spaces and
#            trailing whitespace removed; nothing if there is no out-data key
#######################################
guest_out_data() {
    sed -n 's/.*"out-data"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' \
        | sed -e 's/\\[nt]/ /g' -e 's/[[:space:]]*$//'
}

# Check TrueNAS ZFS (VM 100) if running
if ssh_pve "qm status 100" 2>/dev/null | grep -q running; then
    echo ""
    echo " TrueNAS ZFS:"
    # The pipeline's status is sed's, so a failed guest call shows up as an
    # empty value and is mapped to "Unknown" here.
    TRUENAS_ZFS=$(ssh_pve 'qm guest exec 100 -- bash -c "zpool list -H -o name,health vault 2>/dev/null"' 2>/dev/null | guest_out_data)
    TRUENAS_ZFS=${TRUENAS_ZFS:-Unknown}

    if [[ "$TRUENAS_ZFS" == *"ONLINE"* ]]; then
        print_status "vault pool: ONLINE" "ok"
    elif [[ "$TRUENAS_ZFS" == *"SUSPENDED"* ]]; then
        print_status "vault pool: SUSPENDED (needs zpool clear)" "fail"
        # Read later by the --fix recovery and the summary.
        TRUENAS_ZFS_SUSPENDED=true
    elif [[ "$TRUENAS_ZFS" == *"DEGRADED"* ]]; then
        print_status "vault pool: DEGRADED" "warn"
    else
        print_status "vault pool: $TRUENAS_ZFS" "fail"
    fi
fi

# Check Containers
echo ""
echo " Containers:"
CRITICAL_CTS="200 202" # PiHole, Traefik
STOPPED_CTS=""

#######################################
# Report each container row read from stdin and queue stopped critical ones.
# Globals:   CRITICAL_CTS (read), STOPPED_CTS (appended to)
# Inputs:    `pct list` rows without the header: "VMID Status [Lock] Name"
# Outputs:   one print_status line per non-blank row
#######################################
report_pve_containers() {
    local ctid status rest name
    while read -r ctid status rest; do
        # Skip blank rows instead of reporting them as a failed container.
        [[ -n "$ctid" ]] || continue
        # The Lock column is blank for unlocked containers, which shifts the
        # name into field 3. Take the last field so both layouts work.
        name=${rest##* }
        if [[ "$status" == "running" ]]; then
            print_status "$ctid $name: $status" "ok"
        else
            print_status "$ctid $name: $status" "fail"
            # Only critical containers are queued for --fix recovery.
            if [[ " $CRITICAL_CTS " == *" $ctid "* ]]; then
                STOPPED_CTS="$STOPPED_CTS $ctid"
            fi
        fi
    done
}

# Process substitution keeps the loop in this shell so STOPPED_CTS survives.
report_pve_containers < <(ssh_pve "pct list" | tail -n +2)

# ============================================
# PVE2 (Secondary Server)
# ============================================
echo ""
echo "--- PVE2 (${PVE2_IP}) ---"

# PVE2 is optional: when it is unreachable this is reported and the run
# continues, with none of the checks below attempted.
if ssh_pve2 "echo ok" > /dev/null 2>&1; then
    print_status "PVE2 Reachable" "ok"

    # Check CPU temp
    # One value is printed per hwmon input labelled "Tctl"; keep the highest.
    # The pipeline also keeps an SSH failure from aborting under `set -e`.
    TEMP2=$(ssh_pve2 'for f in /sys/class/hwmon/hwmon*/temp*_input; do label=$(cat ${f%_input}_label 2>/dev/null); if [ "$label" = "Tctl" ]; then echo $(($(cat $f)/1000)); fi; done' | sort -n | tail -n 1)
    # Only compare when a plain integer came back; otherwise print nothing.
    if [[ "$TEMP2" =~ ^[0-9]+$ ]]; then
        if [[ "$TEMP2" -lt 85 ]]; then
            print_status "CPU Temp: ${TEMP2}°C" "ok"
        elif [[ "$TEMP2" -lt 90 ]]; then
            print_status "CPU Temp: ${TEMP2}°C (warm)" "warn"
        else
            print_status "CPU Temp: ${TEMP2}°C (HOT!)" "fail"
        fi
    fi

    # Check VMs (report only; no PVE2 VM is queued for recovery)
    echo ""
    echo " VMs:"
    # `read` splits the `qm list` columns itself; `_` takes the rest.
    while read -r VMID NAME STATUS _; do
        # Skip blank rows instead of reporting them as a failed VM.
        [[ -n "$VMID" ]] || continue
        if [[ "$STATUS" == "running" ]]; then
            print_status "$VMID $NAME: $STATUS" "ok"
        else
            print_status "$VMID $NAME: $STATUS" "fail"
        fi
    done < <(ssh_pve2 "qm list" | tail -n +2)
else
    print_status "PVE2 Unreachable" "fail"
fi

# ============================================
# FIX MODE - Start stopped services
# ============================================
# Runs only with --fix and only when the checks above found something to
# repair. Every remote step is wrapped in `if` so that one failed step is
# reported and the remaining steps still run despite `set -e`.
if $FIX_MODE && [[ -n "$STOPPED_VMS" || -n "$STOPPED_CTS" || "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
    echo ""
    echo "================================"
    echo " RECOVERY MODE"
    echo "================================"

    # Fix TrueNAS ZFS SUSPENDED state first (critical for mounts)
    if [[ "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
        echo ""
        echo "Clearing TrueNAS ZFS pool errors..."
        # Pull the "out-data" value out of the guest-exec JSON and turn its
        # "\n" / "\t" escapes into spaces. An empty result means the call failed.
        ZFS_CLEAR_RESULT=$(ssh_pve 'qm guest exec 100 -- bash -c "zpool clear vault 2>&1 && zpool list -H -o health vault"' 2>/dev/null \
            | sed -n 's/.*"out-data"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' \
            | sed -e 's/\\[nt]/ /g' -e 's/[[:space:]]*$//')
        ZFS_CLEAR_RESULT=${ZFS_CLEAR_RESULT:-FAILED}

        if [[ "$ZFS_CLEAR_RESULT" == *"ONLINE"* ]]; then
            print_status "vault pool recovered: ONLINE" "ok"
        else
            print_status "vault pool recovery failed: $ZFS_CLEAR_RESULT" "fail"
        fi
        sleep 5 # Give ZFS time to stabilize
    fi

    # Start TrueNAS first (it provides storage)
    if [[ " $STOPPED_VMS " == *" 100 "* ]]; then
        echo ""
        echo "Starting TrueNAS (VM 100) first..."
        if ssh_pve "qm start 100"; then
            print_status "TrueNAS started" "ok"
        else
            print_status "Failed to start TrueNAS" "fail"
        fi
        echo "Waiting 60s for TrueNAS to boot..."
        sleep 60
    fi

    # Start other VMs (STOPPED_VMS is a space-separated list, split on purpose)
    for VMID in $STOPPED_VMS; do
        if [[ "$VMID" != "100" ]]; then
            # The name is cosmetic; fall back to the ID if the lookup fails.
            NAME=$(ssh_pve "qm config $VMID | awk '/^name:/ {print \$2}'") || NAME=""
            NAME=${NAME:-VM $VMID}
            echo "Starting VM $VMID ($NAME)..."
            if ssh_pve "qm start $VMID"; then
                print_status "$NAME started" "ok"
            else
                print_status "Failed to start $NAME" "fail"
            fi
            sleep 5
        fi
    done

    # Start containers
    for CTID in $STOPPED_CTS; do
        # The name is cosmetic; fall back to the ID if the lookup fails.
        NAME=$(ssh_pve "pct config $CTID | awk '/^hostname:/ {print \$2}'") || NAME=""
        NAME=${NAME:-CT $CTID}
        echo "Starting CT $CTID ($NAME)..."
        if ssh_pve "pct start $CTID"; then
            print_status "$NAME started" "ok"
        else
            print_status "Failed to start $NAME" "fail"
        fi
        sleep 3
    done

    # Mount TrueNAS shares on Saltbox if Saltbox is running
    if ssh_pve "qm status 101" 2>/dev/null | grep -q running; then
        echo ""
        echo "Checking TrueNAS mounts on Saltbox..."
        sleep 10 # Give services time to start

        # Count of mount lines containing "Media" inside the guest; an empty
        # result (guest call failed) is treated as 0 so a mount is attempted.
        MOUNT_STATUS=$(ssh_pve 'qm guest exec 101 -- bash -c "mount | grep -c Media"' 2>/dev/null \
            | sed -n 's/.*"out-data"[[:space:]]*:[[:space:]]*"\([0-9][0-9]*\).*/\1/p' \
            | head -n 1)
        MOUNT_STATUS=${MOUNT_STATUS:-0}

        if [[ "$MOUNT_STATUS" == "0" ]]; then
            echo "Mounting TrueNAS shares..."
            if ssh_pve 'qm guest exec 101 -- bash -c "mount /mnt/local/Media; mount /mnt/local/downloads"' 2>/dev/null; then
                print_status "TrueNAS mounts attempted" "ok"
            else
                print_status "TrueNAS mount command failed" "fail"
            fi

            # Restart Immich
            echo "Restarting Immich..."
            if ssh_pve 'qm guest exec 101 -- bash -c "docker restart immich"' 2>/dev/null; then
                print_status "Immich restarted" "ok"
            else
                print_status "Immich restart failed" "fail"
            fi
        else
            print_status "TrueNAS mounts already present" "ok"
        fi
    fi
fi

# ============================================
# Summary
# ============================================

#######################################
# Print the closing summary of the run.
# Globals:   FIX_MODE, STOPPED_VMS, STOPPED_CTS, TRUENAS_ZFS_SUSPENDED,
#            RED, GREEN, YELLOW, NC (all read)
# Outputs:   the summary block on stdout
#######################################
print_summary() {
    local fix_mode=${FIX_MODE:-false}
    local issues=0

    echo ""
    echo "================================"
    echo " SUMMARY"
    echo "================================"

    if [[ "$fix_mode" == "true" ]]; then
        # The lists were collected before recovery ran, so they cannot prove
        # the services are healthy now. Ask for a re-check instead.
        if [[ -n "$STOPPED_VMS" || -n "$STOPPED_CTS" || "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
            echo -e "${YELLOW}Recovery was attempted - re-run ./health-check.sh to verify${NC}"
        else
            echo -e "${GREEN}All critical services healthy!${NC}"
        fi
        return 0
    fi

    if [[ -n "$STOPPED_VMS" ]]; then
        echo -e "${YELLOW}Stopped critical VMs:${NC}$STOPPED_VMS"
        issues=$((issues + 1))
    fi

    if [[ -n "$STOPPED_CTS" ]]; then
        echo -e "${YELLOW}Stopped critical containers:${NC}$STOPPED_CTS"
        issues=$((issues + 1))
    fi

    if [[ "$TRUENAS_ZFS_SUSPENDED" == "true" ]]; then
        echo -e "${RED}TrueNAS ZFS pool SUSPENDED!${NC} SMB mounts will fail."
        issues=$((issues + 1))
    fi

    if [[ "$issues" -eq 0 ]]; then
        echo -e "${GREEN}All critical services healthy!${NC}"
    else
        echo ""
        echo -e "Run ${YELLOW}./health-check.sh --fix${NC} to attempt recovery"
    fi
}

print_summary

echo ""
echo "Done: $(date '+%Y-%m-%d %H:%M:%S')"
Reference in New Issue
Block a user