diff --git a/.stfolder/syncthing-folder-8be0b5.txt b/.stfolder/syncthing-folder-8be0b5.txt new file mode 100644 index 0000000..4ef719a --- /dev/null +++ b/.stfolder/syncthing-folder-8be0b5.txt @@ -0,0 +1,5 @@ +# This directory is a Syncthing folder marker. +# Do not delete. + +folderID: homelab +created: 2025-12-23T00:39:52-05:00 diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c86066..15239f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,12 +36,12 @@ Investigated UPS power limit issues across both Proxmox servers. [Unit] Description=Disable KSM (Kernel Same-page Merging) After=multi-user.target - + [Service] Type=oneshot ExecStart=/bin/sh -c "echo 0 > /sys/kernel/mm/ksm/run" RemainAfterExit=yes - + [Install] WantedBy=multi-user.target ``` @@ -108,12 +108,12 @@ curl -X POST -H "X-API-Key: xxx" http://localhost:20910/rest/system/restart [Unit] Description=Set CPU governor to powersave with balance_power EPP After=multi-user.target - + [Service] Type=oneshot ExecStart=/bin/bash -c 'for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo powersave > "$gov"; done; for epp in /sys/devices/system/cpu/cpu*/cpufreq/energy_performance_preference; do echo balance_power > "$epp"; done' RemainAfterExit=yes - + [Install] WantedBy=multi-user.target ``` @@ -127,12 +127,12 @@ curl -X POST -H "X-API-Key: xxx" http://localhost:20910/rest/system/restart [Unit] Description=Set CPU governor to schedutil for power savings After=multi-user.target - + [Service] Type=oneshot ExecStart=/bin/bash -c 'for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo schedutil > "$gov"; done' RemainAfterExit=yes - + [Install] WantedBy=multi-user.target ``` @@ -194,4 +194,4 @@ Not useful when: - `general_profit` is negative ### What is Memory Ballooning? -Guest-cooperative memory management. Hypervisor can request VMs to give back unused RAM. Independent from KSMD. Both are Proxmox/KVM memory optimization features but serve different purposes. 
+Guest-cooperative memory management. Hypervisor can request VMs to give back unused RAM. Independent from KSMD. Both are Proxmox/KVM memory optimization features but serve different purposes. diff --git a/CLAUDE.md b/CLAUDE.md index 7c805c5..2cbc24e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,6 +10,7 @@ This is your **quick reference guide** for common homelab tasks. For detailed in | Task | Documentation | Quick Command | |------|--------------|---------------| +| **Gateway issues** | [GATEWAY.md](GATEWAY.md) | `ssh ucg-fiber 'free -m'` | | **Add new public service** | [TRAEFIK.md](TRAEFIK.md) | Create Traefik config + Cloudflare DNS | | **Check UPS status** | [UPS.md](UPS.md) | `ssh pve 'upsc cyberpower@localhost'` | | **Check server temps** | [Temperature Check](#server-temperature-check) | `ssh pve 'grep Tctl ...'` | @@ -83,6 +84,7 @@ nc -zw1 10.10.10.150 22000 && echo "Windows: UP" || echo "Windows: DOWN" | Symptom | Check | Fix | Docs | |---------|-------|-----|------| +| **Network down** | `ssh ucg-fiber 'free -m'` | Check memory; the watchdog auto-reboots the gateway | [GATEWAY.md](GATEWAY.md) | | Device not syncing | `curl Syncthing API` | Restart Syncthing | [SYNCTHING.md](SYNCTHING.md) | | VM won't start | Storage/RAM available? 
| `ssh pve 'qm start VMID'` | [VMS.md](VMS.md) | | Server running hot | Check KSM, CPU processes | Disable KSM | [POWER-MANAGEMENT.md](POWER-MANAGEMENT.md) | @@ -138,8 +140,10 @@ Router (10.10.10.1) | Syncthing | MacBook | `http://127.0.0.1:8384/rest/` | `X-API-Key: qYkNdVLwy9qZZZ6MqnJr7tHX7KKdxGMJ` | | Syncthing | Phone | `https://10.10.10.54:8384/rest/` | `X-API-Key: Xxz3jDT4akUJe6psfwZsbZwG2LhfZuDM` | | Proxmox | PVE/PVE2 | `https://10.10.10.120:8006/api2/json/` | SSH key auth | +| MetaMCP | docker-host2 | `https://metamcp.htsn.io/` | Web UI login | +| n8n | docker-host2 | `http://10.10.10.207:5678/api/v1/` | `X-N8N-API-KEY` (see [N8N.md](N8N.md)) | -**See**: [SYNCTHING.md](SYNCTHING.md), [HOMEASSISTANT.md](HOMEASSISTANT.md) for more APIs +**See**: [SYNCTHING.md](SYNCTHING.md), [HOMEASSISTANT.md](HOMEASSISTANT.md), [N8N.md](N8N.md) for more APIs --- @@ -185,6 +189,8 @@ sshpass -p 'GrilledCh33s3#' ssh claude@10.10.10.150 \ | 101 | saltbox | 10.10.10.100 | Media stack (Plex) | [VMS.md](VMS.md) | | 110 | homeassistant | 10.10.10.110 | Home automation | [HOMEASSISTANT.md](HOMEASSISTANT.md) | | 202 | traefik (CT) | 10.10.10.250 | Reverse proxy | [TRAEFIK.md](TRAEFIK.md) | +| 206 | docker-host | 10.10.10.206 | Monitoring stack (Grafana/Prometheus) | [VMS.md](VMS.md) | +| 302 | docker-host2 | 10.10.10.207 | MetaMCP, n8n, automation | [VMS.md](VMS.md) | **Complete inventory**: [VMS.md](VMS.md) | **IP assignments**: [IP-ASSIGNMENTS.md](IP-ASSIGNMENTS.md) @@ -239,6 +245,7 @@ ssh pve 'qm guest exec VMID -- bash -c "COMMAND"' ### Infrastructure - [README.md](README.md) - Start here +- [GATEWAY.md](GATEWAY.md) - UniFi gateway, monitoring services - [VMS.md](VMS.md) - VM/CT inventory - [STORAGE.md](STORAGE.md) - ZFS pools, shares - [NETWORK.md](NETWORK.md) - Bridges, VLANs, Tailscale @@ -250,6 +257,7 @@ ssh pve 'qm guest exec VMID -- bash -c "COMMAND"' - [HOMEASSISTANT.md](HOMEASSISTANT.md) - Home automation - [SYNCTHING.md](SYNCTHING.md) - File sync - 
[EMC-ENCLOSURE.md](EMC-ENCLOSURE.md) - Storage enclosure +- [MONITORING.md](MONITORING.md) - System monitoring ### Operations - [SSH-ACCESS.md](SSH-ACCESS.md) - SSH keys, hosts @@ -302,6 +310,27 @@ git add -A && git commit -m "Update docs" && git push ## Recent Changes +### 2026-01-03 +- Deployed **Crafty Controller 4** on docker-host2 for Minecraft server management +- URL: https://mc.htsn.io (Web GUI) +- Minecraft Java: 10.10.10.207:25565 +- Minecraft Bedrock (Geyser): 10.10.10.207:19132/udp +- Admin: `admin` / password in `/crafty/app/config/default-creds.txt` +- World data to be migrated from Windows PC (D:\Minecraft\mcss\servers\hutworld) +- Deployed **MetaMCP** on docker-host2 (10.10.10.207) for unified MCP server management +- URL: https://metamcp.htsn.io +- Added docker-host2 to SSH config (`~/.ssh/config`) +- Updated IP-ASSIGNMENTS.md, SSH-ACCESS.md, TRAEFIK.md with docker-host2 + +### 2026-01-02 +- Created [GATEWAY.md](GATEWAY.md) - UniFi gateway documentation +- Deployed internet-watchdog service (auto-reboot on connectivity loss) +- Deployed memory-monitor service (logs memory usage every 10 min) +- Configured SSH key auth for gateway (`ucg-fiber`/`gateway` aliases) +- Disabled UniFi Connect to free ~200MB RAM +- Updated [MONITORING.md](MONITORING.md) with gateway monitoring +- Updated [SSH-ACCESS.md](SSH-ACCESS.md) with key auth for router + ### 2025-12-22 - Created comprehensive Phase 1 documentation split - New docs: README.md, BACKUP-STRATEGY.md, STORAGE.md, UPS.md, TRAEFIK.md, SSH-ACCESS.md, POWER-MANAGEMENT.md, VMS.md @@ -319,8 +348,8 @@ git add -A && git commit -m "Update docs" && git push --- -**Last Updated**: 2025-12-22 -**Documentation Status**: ✅ Phase 1 Complete +**Last Updated**: 2026-01-03 +**Documentation Status**: ✅ Phase 1 Complete + Gateway Monitoring + MetaMCP --- diff --git a/GATEWAY.md b/GATEWAY.md new file mode 100644 index 0000000..7c5d566 --- /dev/null +++ b/GATEWAY.md @@ -0,0 +1,339 @@ +# UniFi Gateway (UCG-Fiber) + 
+Documentation for the UniFi Cloud Gateway Fiber (10.10.10.1) - the primary network gateway and router. + +## Overview + +| Property | Value | +|----------|-------| +| **Device** | UniFi Cloud Gateway Fiber (UCG-Fiber) | +| **IP Address** | 10.10.10.1 | +| **SSH User** | root | +| **SSH Auth** | SSH key (`~/.ssh/id_ed25519`) | +| **Host Aliases** | `ucg-fiber`, `gateway` | +| **Firmware** | v4.4.9 (as of 2026-01-02) | +| **UniFi Core** | 4.4.19 | +| **RAM** | 2.9 GB (shared with UniFi apps) | + +--- + +## SSH Access + +SSH key authentication is configured. Use host aliases: + +```bash +# Quick access +ssh ucg-fiber 'hostname' +ssh gateway 'free -m' + +# Or use IP directly +ssh root@10.10.10.1 'uptime' +``` + +**Note**: SSH key may need re-deployment after firmware updates if UniFi clears authorized_keys. + +--- + +## Monitoring Services + +Two custom monitoring services run on the gateway to prevent and diagnose issues. + +### Internet Watchdog Service + +**Purpose**: Auto-reboots gateway if internet connectivity is lost for 5+ minutes + +**Location**: `/data/scripts/internet-watchdog.sh` + +**How it works**: +1. Pings 1.1.1.1, 8.8.8.8, 208.67.222.222 every 60 seconds +2. If all three fail, increments failure counter +3. After 5 consecutive failures (~5 minutes), triggers reboot +4. 
Logs all activity to `/var/log/internet-watchdog.log` + +**Commands**: +```bash +# Check service status +ssh ucg-fiber 'systemctl status internet-watchdog' + +# View recent logs +ssh ucg-fiber 'tail -50 /var/log/internet-watchdog.log' + +# Stop temporarily (if troubleshooting) +ssh ucg-fiber 'systemctl stop internet-watchdog' + +# Restart +ssh ucg-fiber 'systemctl restart internet-watchdog' +``` + +**Log Format**: +``` +2026-01-02 22:45:01 - Watchdog started +2026-01-02 22:46:01 - Internet check failed (1/5) +2026-01-02 22:47:01 - Internet restored after 1 failures +``` + +--- + +### Memory Monitor Service + +**Purpose**: Logs memory usage and top processes every 10 minutes for diagnostics + +**Location**: `/data/scripts/memory-monitor.sh` + +**Log File**: `/data/logs/memory-history.log` + +**How it works**: +1. Every 10 minutes, logs current memory usage (`free -m`) +2. Logs top 12 memory-consuming processes +3. Auto-rotates log when it exceeds 10MB (keeps one .old file) + +**Commands**: +```bash +# Check service status +ssh ucg-fiber 'systemctl status memory-monitor' + +# View recent memory history +ssh ucg-fiber 'tail -100 /data/logs/memory-history.log' + +# Check current memory usage +ssh ucg-fiber 'free -m' + +# See top memory consumers right now +ssh ucg-fiber 'ps -eo pid,rss,comm --sort=-rss | head -12' +``` + +**Log Format**: +``` +========== 2026-01-02 22:30:00 ========== +--- MEMORY --- + total used free shared buff/cache available +Mem: 2892 1890 102 456 899 1002 +Swap: 512 88 424 +--- TOP MEMORY PROCESSES --- + PID RSS COMMAND + 1234 327456 unifi-protect + 2345 252108 mongod + 3456 236544 java +... 
+``` + +--- + +## Known Memory Consumers + +| Process | Typical Memory | Purpose | +|---------|----------------|---------| +| unifi-protect | ~320 MB | Camera/NVR management | +| mongod | ~250 MB | UniFi configuration database | +| java (controller) | ~230 MB | UniFi Network controller | +| postgres | ~180 MB | PostgreSQL database | +| unifi-core | ~150 MB | UniFi OS core | +| tailscaled | ~80 MB | Tailscale VPN | + +**Total available**: ~2.9 GB +**Typical usage**: ~1.8-2.0 GB (leaves ~1 GB free) +**Warning threshold**: <500 MB free +**Critical**: <200 MB free or swap >50% used + +--- + +## Disabled Services + +The following services were disabled to reduce memory usage: + +| Service | Memory Saved | Reason Disabled | +|---------|--------------|-----------------| +| UniFi Connect | ~200 MB | Not needed (cameras use Protect) | + +To re-enable if needed: +```bash +ssh ucg-fiber 'systemctl enable unifi-connect && systemctl start unifi-connect' +``` + +--- + +## Common Issues + +### Gateway Freeze / Network Loss + +**Symptoms**: +- All devices lose internet +- Cannot ping 10.10.10.1 +- Physical reboot required + +**Root Cause**: Memory exhaustion causing soft lockup + +**Prevention**: +1. Internet watchdog auto-reboots after 5 min outage +2. Memory monitor logs help identify runaway processes +3. 
UniFi Connect disabled to free ~200 MB + +**Post-Incident Analysis**: +```bash +# Check memory history for spike before freeze +ssh ucg-fiber 'grep -B5 "Swap:" /data/logs/memory-history.log | tail -50' + +# Check watchdog logs +ssh ucg-fiber 'cat /var/log/internet-watchdog.log' + +# Check system logs for errors +ssh ucg-fiber 'dmesg | tail -100' +ssh ucg-fiber 'journalctl -p err --since "1 hour ago"' +``` + +--- + +### High Memory Usage + +**Check current state**: +```bash +ssh ucg-fiber 'free -m && echo "---" && ps -eo pid,rss,comm --sort=-rss | head -15' +``` + +**If swap is heavily used**: +```bash +# Check swap usage +ssh ucg-fiber 'cat /proc/swaps' + +# See what's in swap +ssh ucg-fiber 'for pid in $(ls /proc | grep -E "^[0-9]+$"); do + swap=$(grep VmSwap /proc/$pid/status 2>/dev/null | awk "{print \$2}"); + [ "$swap" -gt 10000 ] 2>/dev/null && echo "$pid: ${swap}kB - $(cat /proc/$pid/comm)"; +done | sort -t: -k2 -rn | head -10' +``` + +**Consider reboot if**: +- Available memory <200 MB +- Swap usage >300 MB +- System becoming unresponsive + +--- + +### Tailscale Issues + +**Check Tailscale status**: +```bash +ssh ucg-fiber 'tailscale status' +``` + +**Common errors and fixes**: + +| Error | Fix | +|-------|-----| +| `DNS resolution failed` | Check upstream DNS (Pi-hole at 10.10.10.10) | +| `TLS handshake failed` | Usually temporary; Tailscale auto-reconnects | +| `Not connected` | `ssh ucg-fiber 'tailscale up'` | + +--- + +## Firmware Updates + +**Check current version**: +```bash +ssh ucg-fiber 'ubnt-systool version' +``` + +**Update process**: +1. Check UniFi site for latest stable firmware +2. Download via UI or CLI +3. 
Schedule update during low-usage time + +**After update**: +- Verify SSH key still works +- Check custom services still running +- Verify Tailscale reconnects + +**Re-deploy SSH key if needed**: +```bash +ssh-copy-id -i ~/.ssh/id_ed25519 root@10.10.10.1 +``` + +--- + +## Service Locations + +| File | Purpose | +|------|---------| +| `/data/scripts/internet-watchdog.sh` | Watchdog script | +| `/data/scripts/memory-monitor.sh` | Memory monitor script | +| `/etc/systemd/system/internet-watchdog.service` | Watchdog systemd unit | +| `/etc/systemd/system/memory-monitor.service` | Memory monitor systemd unit | +| `/var/log/internet-watchdog.log` | Watchdog log | +| `/data/logs/memory-history.log` | Memory history log | + +**Note**: `/data/` persists across firmware updates. `/var/log/` may not. + +--- + +## Quick Reference Commands + +```bash +# System status +ssh ucg-fiber 'uptime && free -m' + +# Check both monitoring services +ssh ucg-fiber 'systemctl status internet-watchdog memory-monitor' + +# Memory history (last hour) +ssh ucg-fiber 'tail -60 /data/logs/memory-history.log' + +# Watchdog activity +ssh ucg-fiber 'tail -20 /var/log/internet-watchdog.log' + +# Network devices (ARP table) +ssh ucg-fiber 'cat /proc/net/arp' + +# Tailscale status +ssh ucg-fiber 'tailscale status' + +# System logs +ssh ucg-fiber 'journalctl -p warning --since "1 hour ago" | head -50' +``` + +--- + +## Backup Considerations + +Custom services in `/data/scripts/` persist across firmware updates but may need: +- Systemd services re-enabled after major updates +- Script permissions re-applied if wiped + +**Backup critical files**: +```bash +# Copy scripts locally for reference +scp ucg-fiber:/data/scripts/*.sh ~/Projects/homelab/data/scripts/ +``` + +--- + +## Related Documentation + +- [SSH-ACCESS.md](SSH-ACCESS.md) - SSH configuration and host aliases +- [NETWORK.md](NETWORK.md) - Network architecture +- [MONITORING.md](MONITORING.md) - Overall monitoring strategy +- 
[HOMEASSISTANT.md](HOMEASSISTANT.md) - Home Assistant integration + +--- + +## Incident History + +### 2025-12-27 to 2025-12-29: Gateway Freeze + +**Timeline**: +- Dec 7: Firmware update to v4.4.9 +- Dec 24: Last healthy system logs +- Dec 27-29: "No internet detected" errors in logs +- Dec 29+: Complete silence (gateway frozen) +- Jan 2: Physical reboot restored access + +**Root Cause**: Memory exhaustion causing soft lockup (no crash dump saved) + +**Resolution**: +- Deployed internet-watchdog service +- Deployed memory-monitor service +- Disabled UniFi Connect (~200 MB saved) +- Configured SSH key auth + +--- + +**Last Updated**: 2026-01-02 diff --git a/IP-ASSIGNMENTS.md b/IP-ASSIGNMENTS.md index 27a826c..f0ed04f 100644 --- a/IP-ASSIGNMENTS.md +++ b/IP-ASSIGNMENTS.md @@ -45,6 +45,7 @@ This document tracks all IP addresses in the homelab infrastructure. |------|------|------------|---------|--------| | 300 | gitea-vm | 10.10.10.220 | Git server | Running | | 301 | trading-vm | 10.10.10.221 | AI trading platform (RTX A6000) | Running | +| 302 | docker-host2 | 10.10.10.207 | Docker services (n8n, future apps) | Running | ## Workstations & Personal Devices @@ -69,6 +70,9 @@ This document tracks all IP addresses in the homelab infrastructure. | CopyParty | cp.htsn.io | 10.10.10.201:3923 | Traefik-Primary | | LMDev | lmdev.htsn.io | 10.10.10.111 | Traefik-Primary | | Excalidraw | excalidraw.htsn.io | 10.10.10.206:8080 | Traefik-Primary | +| MetaMCP | metamcp.htsn.io | 10.10.10.207:12008 | Traefik-Primary | +| n8n | n8n.htsn.io | 10.10.10.207:5678 | Traefik-Primary | +| Crafty Controller | mc.htsn.io | 10.10.10.207:8443 | Traefik-Primary | | Plex | plex.htsn.io | 10.10.10.100:32400 | Traefik-Saltbox | | Sonarr | sonarr.htsn.io | 10.10.10.100:8989 | Traefik-Saltbox | | Radarr | radarr.htsn.io | 10.10.10.100:7878 | Traefik-Saltbox | @@ -92,6 +96,7 @@ This document tracks all IP addresses in the homelab infrastructure. 
- .200 - TrueNAS - .201 - CopyParty - .206 - Docker-host +- .207 - Docker-host2 - .220 - Gitea - .221 - Trading VM - .250 - Traefik-Primary @@ -110,7 +115,7 @@ This document tracks all IP addresses in the homelab infrastructure. - 10.10.10.148 - 10.10.10.149 (2 IPs) - 10.10.10.151 - 10.10.10.199 (49 IPs) - 10.10.10.202 - 10.10.10.205 (4 IPs) -- 10.10.10.207 - 10.10.10.219 (13 IPs) +- 10.10.10.208 - 10.10.10.219 (12 IPs) - 10.10.10.222 - 10.10.10.249 (28 IPs) - 10.10.10.251 - 10.10.10.254 (4 IPs) @@ -123,6 +128,18 @@ This document tracks all IP addresses in the homelab infrastructure. | Portainer Agent | 9001 | Remote management from other Portainer | | Gotenberg | 3000 | PDF generation API | +## Docker Host 2 Services (10.10.10.207) - PVE2 + +| Service | Port | Purpose | +|---------|------|---------| +| MetaMCP | 12008 | MCP Aggregator/Gateway (metamcp.htsn.io) | +| n8n | 5678 | Workflow automation | +| Crafty Controller | 8443 | Minecraft server management (mc.htsn.io) | +| Minecraft Java | 25565 | Minecraft Java Edition server | +| Minecraft Bedrock | 19132/udp | Minecraft Bedrock Edition (Geyser) | +| Trading Redis | 6379 | Redis for trading platform | +| Trading TimescaleDB | 5433 | TimescaleDB for trading platform | + ## Syncthing API Endpoints | Device | IP | Port | API Key | diff --git a/MINECRAFT.md b/MINECRAFT.md new file mode 100644 index 0000000..9e6bf4d --- /dev/null +++ b/MINECRAFT.md @@ -0,0 +1,478 @@ +# Minecraft Server - Hutworld + +Minecraft server running on docker-host2 via Crafty Controller 4. 
+ +--- + +## Quick Reference + +| Setting | Value | +|---------|-------| +| **Web GUI** | https://mc.htsn.io | +| **Game Server (Java)** | hutworld.htsn.io:25565 | +| **Game Server (Bedrock)** | hutworld.htsn.io:19132 | +| **Host** | docker-host2 (10.10.10.207) | +| **Server Type** | Paper 1.21.11 | +| **World Name** | hutworld | +| **Memory** | 2GB min / 4GB max | + +--- + +## Crafty Controller Access + +| Setting | Value | +|---------|-------| +| **URL** | https://mc.htsn.io | +| **Username** | admin | +| **Password** | See `~/crafty/data/config/default-creds.txt` on docker-host2 | + +**Get password:** +```bash +ssh docker-host2 'cat ~/crafty/data/config/default-creds.txt' +``` + +--- + +## Current Status + +### Completed + +- [x] Crafty Controller 4.4.7 deployed on docker-host2 +- [x] Traefik reverse proxy configured (mc.htsn.io → 10.10.10.207:8443) +- [x] DNS A record created for hutworld.htsn.io (non-proxied, points to public IP) +- [x] Port forwarding configured via UniFi API: + - TCP/UDP 25565 → 10.10.10.207 (Java Edition) + - UDP 19132 → 10.10.10.207 (Bedrock via Geyser) +- [x] Server files transferred from Windows PC (D:\Minecraft\mcss\servers\hutworld) +- [x] Server imported into Crafty and running +- [x] Paper upgraded from 1.21.5 to 1.21.11 +- [x] Plugins updated (GSit 3.1.1, LuckPerms 5.5.22) +- [x] Orphaned plugin data cleaned up +- [x] LuckPerms database restored with original permissions +- [x] Automated backups to TrueNAS configured (every 6 hours) + +### Pending + +- [ ] Change Crafty admin password to something memorable +- [ ] Test external connectivity from outside network + +--- + +## Import Instructions + +To import the hutworld server in Crafty: + +1. Go to **Servers** → Click **+ Create New Server** +2. Select **Import Server** tab +3. 
Fill in: + - **Server Name:** `Hutworld` + - **Import Path:** `/crafty/import/hutworld` + - **Server JAR:** `paper.jar` + - **Min RAM:** `2048` (2GB) + - **Max RAM:** `6144` (6GB) + - **Server Port:** `25565` +4. Click **Import Server** +5. Go to server → Click **Start** + +--- + +## Server Configuration + +### World Data + +| World | Description | +|-------|-------------| +| hutworld | Main overworld | +| hutworld_nether | Nether dimension | +| hutworld_the_end | End dimension | + +### Installed Plugins + +| Plugin | Version | Purpose | +|--------|---------|---------| +| EssentialsX | 2.20.1 | Core server commands | +| EssentialsXChat | 2.20.1 | Chat formatting | +| EssentialsXSpawn | 2.20.1 | Spawn management | +| Geyser-Spigot | Latest | Bedrock Edition support | +| floodgate | Latest | Bedrock authentication | +| GSit | 3.1.1 | Sit/lay/crawl animations | +| LuckPerms | 5.5.22 | Permissions management | +| PluginPortal | 2.2.2 | Plugin management | +| Vault | 1.7.3 | Economy/permissions API | +| ViaVersion | Latest | Multi-version support | +| ViaBackwards | Latest | Older client support | +| randomtp | Latest | Random teleportation | + +**Removed plugins** (cleaned up 2026-01-03): +- GriefPrevention, Multiverse-Core, Multiverse-Portals, ProtocolLib, WorldEdit, WorldGuard (disabled/orphaned) + +--- + +## Docker Configuration + +**Location:** `~/crafty/docker-compose.yml` on docker-host2 + +```yaml +services: + crafty: + image: registry.gitlab.com/crafty-controller/crafty-4:4.4.7 + container_name: crafty + restart: unless-stopped + environment: + - TZ=America/New_York + ports: + - "8443:8443" # Web GUI (HTTPS) + - "8123:8123" # Dynmap (if used) + - "25565:25565" # Minecraft Java + - "25566:25566" # Additional server + - "19132:19132/udp" # Minecraft Bedrock (Geyser) + volumes: + - ./data/backups:/crafty/backups + - ./data/logs:/crafty/logs + - ./data/servers:/crafty/servers + - ./data/config:/crafty/app/config + - ./data/import:/crafty/import +``` + +--- + +## 
Traefik Configuration + +**File:** `/etc/traefik/conf.d/crafty.yaml` on CT 202 (10.10.10.250) + +```yaml +http: + routers: + crafty-secure: + entryPoints: + - websecure + rule: "Host(`mc.htsn.io`)" + service: crafty + tls: + certResolver: cloudflare + priority: 50 + + services: + crafty: + loadBalancer: + servers: + - url: "https://10.10.10.207:8443" + serversTransport: crafty-transport@file + + serversTransports: + crafty-transport: + insecureSkipVerify: true +``` + +--- + +## Port Forwarding (UniFi) + +Configured via UniFi API on UCG-Fiber (10.10.10.1): + +| Rule Name | Port | Protocol | Destination | +|-----------|------|----------|-------------| +| Minecraft Java | 25565 | TCP/UDP | 10.10.10.207:25565 | +| Minecraft Bedrock | 19132 | UDP | 10.10.10.207:19132 | + +--- + +## DNS Records (Cloudflare) + +| Record | Type | Value | Proxied | +|--------|------|-------|---------| +| mc.htsn.io | CNAME | htsn.io | Yes (for web GUI) | +| hutworld.htsn.io | A | 70.237.94.174 | No (direct for game traffic) | + +**Note:** Game traffic (25565, 19132) cannot be proxied through Cloudflare - only HTTP/HTTPS works with Cloudflare proxy. + +--- + +## LuckPerms Web Editor + +After server is running: + +1. Open Crafty console for Hutworld server +2. Run command: `/lp editor` +3. A unique URL will be generated (cloud-hosted by LuckPerms) +4. Open the URL in browser to manage permissions + +The editor is hosted by LuckPerms, so no additional port forwarding is needed. + +--- + +## Backup Configuration + +### Automated Backups to TrueNAS + +Backups run automatically every 6 hours and are stored on TrueNAS. 
+ +| Setting | Value | +|---------|-------| +| **Destination** | TrueNAS (10.10.10.200) | +| **Path** | `/mnt/vault/users/backups/minecraft/` | +| **Frequency** | Every 6 hours (12am, 6am, 12pm, 6pm) | +| **Retention** | 14 backups (~3.5 days of history) | +| **Size** | ~2.3 GB per backup | +| **Script** | `/home/hutson/minecraft-backup.sh` on docker-host2 | +| **Log** | `/home/hutson/minecraft-backup.log` on docker-host2 | + +### Backup Script + +**Location:** `~/minecraft-backup.sh` on docker-host2 + +```bash +#!/bin/bash +# Minecraft Server Backup Script +# Backs up Crafty server data to TrueNAS + +BACKUP_SRC="$HOME/crafty/data/servers/19f604a9-f037-442d-9283-0761c73cfd60" +BACKUP_DEST="hutson@10.10.10.200:/mnt/vault/users/backups/minecraft" +DATE=$(date +%Y-%m-%d_%H%M) +BACKUP_NAME="hutworld-$DATE.tar.gz" +LOCAL_BACKUP="/tmp/$BACKUP_NAME" + +# Create compressed backup (exclude large unnecessary files) +tar -czf "$LOCAL_BACKUP" \ + --exclude="*.jar" \ + --exclude="cache" \ + --exclude="libraries" \ + --exclude=".paper-remapped" \ + -C "$HOME/crafty/data/servers" \ + 19f604a9-f037-442d-9283-0761c73cfd60 + +# Transfer to TrueNAS +sshpass -p 'GrilledCh33s3#' scp -o StrictHostKeyChecking=no "$LOCAL_BACKUP" "$BACKUP_DEST/" + +# Clean up local temp file +rm -f "$LOCAL_BACKUP" + +# Keep only last 14 backups on TrueNAS +sshpass -p 'GrilledCh33s3#' ssh -o StrictHostKeyChecking=no hutson@10.10.10.200 ' + cd /mnt/vault/users/backups/minecraft + ls -t hutworld-*.tar.gz 2>/dev/null | tail -n +15 | xargs -r rm -f +' +``` + +### Cron Schedule + +```bash +# View current schedule +ssh docker-host2 'crontab -l | grep minecraft' + +# Output: 0 */6 * * * /home/hutson/minecraft-backup.sh >> /home/hutson/minecraft-backup.log 2>&1 +``` + +### Manual Backup Commands + +```bash +# Run backup manually +ssh docker-host2 '~/minecraft-backup.sh' + +# Check backup log +ssh docker-host2 'tail -20 ~/minecraft-backup.log' + +# List backups on TrueNAS +sshpass -p 'GrilledCh33s3#' ssh -o 
StrictHostKeyChecking=no hutson@10.10.10.200 \ + 'ls -lh /mnt/vault/users/backups/minecraft/' +``` + +### Restore from Backup + +```bash +# 1. Stop the server in Crafty web UI + +# 2. Copy backup from TrueNAS +sshpass -p 'GrilledCh33s3#' scp -o StrictHostKeyChecking=no \ + hutson@10.10.10.200:/mnt/vault/users/backups/minecraft/hutworld-YYYY-MM-DD_HHMM.tar.gz \ + /tmp/ + +# 3. Extract to server directory (backup existing first) +ssh docker-host2 'cd ~/crafty/data/servers && \ + mv 19f604a9-f037-442d-9283-0761c73cfd60 19f604a9-f037-442d-9283-0761c73cfd60.old && \ + tar -xzf /tmp/hutworld-YYYY-MM-DD_HHMM.tar.gz' + +# 4. Start server in Crafty web UI +``` + +--- + +## Common Tasks + +### Start/Stop Server + +Via Crafty web UI at https://mc.htsn.io, or: + +```bash +# Check Crafty container status +ssh docker-host2 'docker ps | grep crafty' + +# Restart Crafty container +ssh docker-host2 'cd ~/crafty && docker compose restart' + +# View Crafty logs +ssh docker-host2 'docker logs -f crafty' +``` + +### Backup Server + +See [Backup Configuration](#backup-configuration) for full details. + +```bash +# Run backup manually +ssh docker-host2 '~/minecraft-backup.sh' + +# Check recent backups +sshpass -p 'GrilledCh33s3#' ssh -o StrictHostKeyChecking=no hutson@10.10.10.200 \ + 'ls -lht /mnt/vault/users/backups/minecraft/ | head -5' +``` + +### Update Plugins + +1. Download new plugin JAR +2. Upload via Crafty Files tab, or: +```bash +scp plugin.jar docker-host2:~/crafty/data/servers/hutworld/plugins/ +``` +3. 
Restart server in Crafty + +### Check Server Logs + +Via Crafty web UI (Logs tab), or: +```bash +ssh docker-host2 'tail -f ~/crafty/data/servers/hutworld/logs/latest.log' +``` + +--- + +## Troubleshooting + +### Server won't start + +```bash +# Check Crafty container logs +ssh docker-host2 'docker logs crafty --tail 50' + +# Check server logs +ssh docker-host2 'cat ~/crafty/data/servers/hutworld/logs/latest.log | tail -100' + +# Check Java version in container +ssh docker-host2 'docker exec crafty java -version' +``` + +### Can't connect externally + +1. Verify port forwarding is active: +```bash +ssh root@10.10.10.1 'iptables -t nat -L -n | grep 25565' +``` + +2. Test from external network: +```bash +nc -zv hutworld.htsn.io 25565 +``` + +3. Check if server is listening: +```bash +ssh docker-host2 'netstat -tlnp | grep 25565' +``` + +### Bedrock players can't connect + +1. Verify Geyser plugin is installed and enabled +2. Check Geyser config: `~/crafty/data/servers/hutworld/plugins/Geyser-Spigot/config.yml` +3. Ensure UDP 19132 is forwarded and not blocked + +### LuckPerms missing users/permissions + +If LuckPerms shows a fresh database (missing users like Suwan): + +1. **Check if original database exists:** +```bash +ssh docker-host2 'ls -la ~/crafty/data/import/hutworld/plugins/LuckPerms/*.db' +``` + +2. **Restore from import backup:** +```bash +# Stop server in Crafty UI first +ssh docker-host2 'cp ~/crafty/data/import/hutworld/plugins/LuckPerms/luckperms-h2-v2.mv.db \ + ~/crafty/data/servers/19f604a9-f037-442d-9283-0761c73cfd60/plugins/LuckPerms/' +``` + +3. 
**Or restore from TrueNAS backup:** +```bash +# List available backups +sshpass -p 'GrilledCh33s3#' ssh -o StrictHostKeyChecking=no hutson@10.10.10.200 \ + 'ls -lt /mnt/vault/users/backups/minecraft/' + +# Extract LuckPerms database from backup +sshpass -p 'GrilledCh33s3#' scp hutson@10.10.10.200:/mnt/vault/users/backups/minecraft/hutworld-YYYY-MM-DD_HHMM.tar.gz /tmp/ +tar -xzf /tmp/hutworld-*.tar.gz -C /tmp --strip-components=2 \ + '*/plugins/LuckPerms/luckperms-h2-v2.mv.db' +``` + +4. **Restart server in Crafty UI** + +--- + +## Migration History + +### 2026-01-04: Backup System + +- Configured automated backups to TrueNAS every 6 hours +- Set 14-backup retention (~3.5 days of recovery points) +- Created backup script with compression and cleanup +- Storage: `/mnt/vault/users/backups/minecraft/` + +### 2026-01-03: Server Fixes & Updates + +**Updates:** +- Upgraded Paper from 1.21.5 to 1.21.11 (build 69) +- Updated GSit from 2.3.2 to 3.1.1 +- Fixed corrupted LuckPerms JAR (re-downloaded 5.5.22) +- Restored original LuckPerms database with user permissions + +**Cleanup:** +- Removed disabled plugins: Dynmap, Graves +- Removed orphaned data folders: GriefPreventionData, SilkSpawners_v2, Graves, ViaRewind + +**Fixes:** +- Fixed memory allocation (was attempting 2TB, set to 2GB min / 4GB max) +- Fixed file permissions for Docker container access + +### 2026-01-03: Initial Migration + +**Source:** Windows PC (10.10.10.150) - D:\Minecraft\mcss\servers\hutworld + +**Steps completed:** +1. Compressed hutworld folder on Windows (2.4GB zip) +2. Transferred via SCP to docker-host2 +3. Unzipped to ~/crafty/data/import/hutworld +4. Downloaded Paper 1.21.5 JAR (later upgraded to 1.21.11) +5. Imported server into Crafty Controller +6. Configured port forwarding (updated existing 25565 rule, added 19132) +7. 
Created DNS record for hutworld.htsn.io + +**Original MCSS config preserved:** `mcss_server_config.json` + +--- + +## Related Documentation + +- [IP Assignments](IP-ASSIGNMENTS.md) - Network configuration +- [Traefik](TRAEFIK.md) - Reverse proxy setup +- [VMs](VMS.md) - docker-host2 details +- [Gateway](GATEWAY.md) - UCG-Fiber configuration + +--- + +## Resources + +- [Crafty Controller Docs](https://docs.craftycontrol.com/) +- [Paper MC](https://papermc.io/) +- [Geyser MC](https://geysermc.org/) +- [LuckPerms](https://luckperms.net/) + +--- + +**Last Updated:** 2026-01-04 diff --git a/MONITORING.md b/MONITORING.md index fad8286..a1eb47f 100644 --- a/MONITORING.md +++ b/MONITORING.md @@ -6,17 +6,18 @@ Documentation for system monitoring, health checks, and alerting across the home | Component | Monitored? | Method | Alerts | Notes | |-----------|------------|--------|--------|-------| +| **Gateway** | ✅ Yes | Custom services | ✅ Auto-reboot | Internet watchdog + memory monitor | | **UPS** | ✅ Yes | NUT + Home Assistant | ❌ No | Battery, load, runtime tracked | | **Syncthing** | ✅ Partial | API (manual checks) | ❌ No | Connection status available | | **Server temps** | ✅ Partial | Manual checks | ❌ No | Via `sensors` command | | **VM status** | ✅ Partial | Proxmox UI | ❌ No | Manual monitoring | | **ZFS health** | ❌ No | Manual `zpool status` | ❌ No | No automated checks | | **Disk health (SMART)** | ❌ No | Manual `smartctl` | ❌ No | No automated checks | -| **Network** | ❌ No | - | ❌ No | No uptime monitoring | +| **Network** | ✅ Partial | Gateway watchdog | ✅ Auto-reboot | Connectivity check every 60s | | **Services** | ❌ No | - | ❌ No | No health checks | | **Backups** | ❌ No | - | ❌ No | No verification | -**Overall Status**: ⚠️ **MINIMAL** - Most monitoring is manual, no automated alerts +**Overall Status**: ⚠️ **PARTIAL** - Gateway monitoring active, most else is manual --- @@ -51,6 +52,41 @@ ssh pve 'upsc cyberpower@localhost | grep -E 
"battery.charge:|battery.runtime:|u --- +### Gateway Monitoring + +**Status**: ✅ **Active with auto-recovery** + +Two custom systemd services monitor the UCG-Fiber gateway (10.10.10.1): + +**1. Internet Watchdog** (`internet-watchdog.service`) +- Pings external DNS (1.1.1.1, 8.8.8.8, 208.67.222.222) every 60 seconds +- Auto-reboots gateway after 5 consecutive failures (~5 minutes) +- Logs to `/var/log/internet-watchdog.log` + +**2. Memory Monitor** (`memory-monitor.service`) +- Logs memory usage and top processes every 10 minutes +- Logs to `/data/logs/memory-history.log` +- Auto-rotates when log exceeds 10MB + +**Quick Commands**: +```bash +# Check service status +ssh ucg-fiber 'systemctl status internet-watchdog memory-monitor' + +# View watchdog activity +ssh ucg-fiber 'tail -20 /var/log/internet-watchdog.log' + +# View memory history +ssh ucg-fiber 'tail -100 /data/logs/memory-history.log' + +# Current memory usage +ssh ucg-fiber 'free -m && ps -eo pid,rss,comm --sort=-rss | head -12' +``` + +**See**: [GATEWAY.md](GATEWAY.md) + +--- + ### Syncthing Monitoring **Status**: ⚠️ **Partial** - API available, no automated monitoring @@ -534,6 +570,7 @@ done' ## Related Documentation +- [GATEWAY.md](GATEWAY.md) - Gateway monitoring and troubleshooting - [UPS.md](UPS.md) - UPS monitoring details - [STORAGE.md](STORAGE.md) - ZFS health checks - [SERVICES.md](SERVICES.md) - Service inventory @@ -542,5 +579,5 @@ done' --- -**Last Updated**: 2025-12-22 -**Status**: ⚠️ **Minimal monitoring currently in place - implementation needed** +**Last Updated**: 2026-01-02 +**Status**: ⚠️ **Partial monitoring - Gateway active, other systems need implementation** diff --git a/N8N-INTEGRATIONS.md b/N8N-INTEGRATIONS.md new file mode 100644 index 0000000..2349d1e --- /dev/null +++ b/N8N-INTEGRATIONS.md @@ -0,0 +1,382 @@ +# n8n Homelab Integrations - Quick Start Guide + +n8n is running on your homelab network (10.10.10.207) and can access all local services. 
This guide sets up useful automations. + +--- + +## Network Access Verified + +n8n can connect to: +- ✅ **Home Assistant** (10.10.10.110:8123) +- ✅ **Prometheus** (10.10.10.206:9090) +- ✅ **Grafana** (10.10.10.206:3001) +- ✅ **Syncthing** (10.10.10.200:8384) +- ✅ **PiHole** (10.10.10.10) +- ✅ **Gitea** (10.10.10.220:3000) +- ✅ **Proxmox** (10.10.10.120:8006, 10.10.10.102:8006) +- ✅ **TrueNAS** (10.10.10.200) +- ✅ **All external APIs** (via internet) + +--- + +## Initial Setup (First-Time) + +1. Open **https://n8n.htsn.io** +2. Complete the setup wizard: + - **Owner Email:** hutson@htsn.io + - **Owner Name:** Hutson + - **Password:** (choose secure password) +3. Skip data sharing (optional) + +--- + +## Credentials to Add in n8n + +Go to **Settings → Credentials** and add: + +### 1. Home Assistant + +| Field | Value | +|-------|-------| +| **Credential Type** | Home Assistant API | +| **Host** | `http://10.10.10.110:8123` | +| **Access Token** | (get from Home Assistant) | + +**Get Token:** Home Assistant → Profile → Long-Lived Access Tokens → Create Token + +--- + +### 2. Prometheus + +| Field | Value | +|-------|-------| +| **Credential Type** | HTTP Request (Generic) | +| **URL** | `http://10.10.10.206:9090` | +| **Authentication** | None | + +--- + +### 3. Grafana + +| Field | Value | +|-------|-------| +| **Credential Type** | Grafana API | +| **URL** | `http://10.10.10.206:3001` | +| **API Key** | (create in Grafana) | + +**Get API Key:** Grafana → Administration → Service Accounts → Create → Add Token + +--- + +### 4. Syncthing + +| Field | Value | +|-------|-------| +| **Credential Type** | HTTP Request (Generic) | +| **URL** | `http://10.10.10.200:8384` | +| **Header Name** | `X-API-Key` | +| **Header Value** | `VFJ7XZPJoWvkYj6fKzpQxc9u3XC8KUBs` | + +--- + +### 5. 
Telegram Bot + +| Field | Value | +|-------|-------| +| **Credential Type** | Telegram API | +| **Access Token** | `8450212653:AAHoVBlNUuA0vtrVPMNUfSgJh_gmFMxlrBg` | + +**Your Chat ID:** `1004084736` + +--- + +### 6. Proxmox + +| Field | Value | +|-------|-------| +| **Credential Type** | HTTP Request (Generic) | +| **URL** | `https://10.10.10.120:8006` | +| **Authentication** | API Token | +| **Token** | (use monitoring@pve token if needed) | + +--- + +## Starter Workflows + +### Workflow 1: Homelab Health Check (Every Hour) + +**Nodes:** +1. **Schedule Trigger** (every hour) +2. **HTTP Request** → Prometheus query for down hosts + - URL: `http://10.10.10.206:9090/api/v1/query` + - Query param: `query=up{job=~"node.*"} == 0` +3. **If** → Check if any hosts are down +4. **Telegram** → Send alert if hosts down + +**PromQL Query:** +``` +up{job=~"node.*"} == 0 +``` + +--- + +### Workflow 2: Daily Backup Status + +**Nodes:** +1. **Schedule Trigger** (8am daily) +2. **HTTP Request** → Query Syncthing sync status + - URL: `http://10.10.10.200:8384/rest/db/status?folder=backup` + - Header: `X-API-Key: VFJ7XZPJoWvkYj6fKzpQxc9u3XC8KUBs` +3. **Function** → Check if folder is syncing +4. **Telegram** → Send daily status report + +--- + +### Workflow 3: High CPU Alert + +**Nodes:** +1. **Schedule Trigger** (every 5 minutes) +2. **HTTP Request** → Prometheus CPU query + - URL: `http://10.10.10.206:9090/api/v1/query` + - Query: `100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)` +3. **If** → CPU > 90% +4. **Telegram** → Send alert + +--- + +### Workflow 4: UPS Power Event + +**Webhook Trigger Setup:** +1. Create webhook trigger in n8n +2. Get webhook URL: `https://n8n.htsn.io/webhook/ups-alert` +3. Configure NUT to call webhook on power events + +**Nodes:** +1. **Webhook Trigger** → Receive UPS event +2. **Switch** → Route by event type (on battery, low battery, online) +3. 
**Telegram** → Send appropriate alert + +--- + +### Workflow 5: Gitea → Deploy on Push + +**Nodes:** +1. **Webhook Trigger** → Gitea push event +2. **If** → Check if branch is `main` +3. **SSH** → Connect to target server +4. **Execute Command** → `git pull && docker-compose up -d` +5. **Telegram** → Notify deployment complete + +--- + +### Workflow 6: Syncthing Folder Behind Alert + +**Nodes:** +1. **Schedule Trigger** (every 30 minutes) +2. **HTTP Request** → Get all folder statuses + - URL: `http://10.10.10.200:8384/rest/stats/folder` +3. **Function** → Check if any folder has errors or is significantly behind +4. **If** → Errors found +5. **Telegram** → Alert with folder name and status + +--- + +### Workflow 7: Grafana Alert Forwarder + +**Purpose:** Forward Grafana alerts to Telegram + +**Nodes:** +1. **Webhook Trigger** → Grafana webhook +2. **Function** → Parse alert data +3. **Telegram** → Format and send alert + +**Grafana Setup:** +- Contact Point → Add webhook: `https://n8n.htsn.io/webhook/grafana-alerts` + +--- + +### Workflow 8: Daily Homelab Summary + +**Nodes:** +1. **Schedule Trigger** (9am daily) +2. **Multiple HTTP Requests in parallel:** + - Prometheus: System uptime + - Prometheus: Average CPU usage (24h) + - Prometheus: Disk usage + - Syncthing: Sync status (all folders) + - PiHole: Queries blocked (24h) +3. **Function** → Format data as summary +4. **Telegram** → Send daily report + +**Example Output:** +``` +🏠 Homelab Daily Summary + +✅ All systems operational +⏱️ Uptime: 14 days +📊 Avg CPU: 12% +💾 Disk: 45% used +🔄 Syncthing: All folders in sync +🛡️ PiHole: 2,341 queries blocked + +Last updated: 2025-12-27 09:00 +``` + +--- + +### Workflow 9: VM State Change Monitor + +**Nodes:** +1. **Schedule Trigger** (every 1 minute) +2. **HTTP Request** → Query Proxmox API for VM list +3. **Function** → Compare with previous state (use Set node) +4. **If** → VM state changed +5. 
**Telegram** → Notify VM started/stopped + +--- + +### Workflow 10: Internet Speed Test Alert + +**Nodes:** +1. **Schedule Trigger** (every 6 hours) +2. **HTTP Request** → Prometheus speedtest exporter +3. **If** → Download speed < 500 Mbps +4. **Telegram** → Alert about slow internet + +--- + +## Advanced Integration Ideas + +### Home Assistant Automations +- Turn on lights when server room temperature > 80°F +- Trigger workflows from HA button press +- Send sensor data to external services + +### Proxmox Automation +- Auto-snapshot VMs before updates +- Clone VMs for testing +- Monitor resource usage and rebalance + +### Media Management +- Notify when new Plex content added +- Auto-organize downloads +- Send weekly watch statistics + +### Backup Monitoring +- Verify all Syncthing folders synced +- Alert on ZFS scrub errors +- Monitor snapshot ages + +### Security +- Alert on failed SSH attempts (from logs) +- Monitor SSL certificate expiration +- Track unusual network traffic patterns + +--- + +## n8n Best Practices + +1. **Error Handling:** Always add error workflows to catch failures +2. **Rate Limiting:** Don't query APIs too frequently +3. **Credentials:** Never hardcode - always use credential store +4. **Testing:** Use manual trigger during development +5. **Logging:** Add Set nodes to track workflow state +6. 
**Backups:** Export workflows regularly (Settings → Export) + +--- + +## Useful PromQL Queries for n8n + +**CPU Usage:** +```promql +100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) +``` + +**Memory Usage:** +```promql +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 +``` + +**Disk Usage:** +```promql +(node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 +``` + +**Hosts Down:** +```promql +up{job=~"node.*"} == 0 +``` + +**Syncthing Disconnected:** +```promql +up{job=~"syncthing.*"} == 0 +``` + +--- + +## Webhook URLs + +After creating webhooks in n8n, you'll get URLs like: +- `https://n8n.htsn.io/webhook/your-webhook-name` + +These can be called from: +- Grafana alerts +- Home Assistant automations +- Gitea webhooks +- Custom scripts +- UPS monitoring (NUT) + +--- + +## Testing Credentials + +Test each credential after adding: +1. Create simple workflow with manual trigger +2. Add HTTP Request node with credential +3. Execute and check response +4. Verify data returned correctly + +--- + +## Troubleshooting + +**Can't reach local service:** +- Verify service IP and port +- Check if service requires HTTPS +- Test with `curl` from docker-host2 first + +**Webhook not triggering:** +- Check n8n is accessible: `curl https://n8n.htsn.io/webhook/test` +- Verify webhook URL in external service +- Check n8n execution logs + +**Workflow fails silently:** +- Enable "Execute on Error" workflow +- Check workflow execution list +- Add Function nodes to log data + +**API authentication fails:** +- Verify credential is saved +- Check API token hasn't expired +- Test with curl manually first + +--- + +## Next Steps + +1. **Add Credentials** - Start with Telegram and Prometheus +2. **Create Test Workflow** - Simple hourly health check +3. **Test Telegram** - Verify messages arrive +4. **Build Gradually** - Add one workflow at a time +5. 
**Export Backups** - Save workflows regularly + +--- + +## Resources + +- **n8n Docs:** https://docs.n8n.io +- **Community Workflows:** https://n8n.io/workflows +- **Your n8n:** https://n8n.htsn.io +- **Your API Docs:** [N8N.md](N8N.md) + +**Last Updated:** 2025-12-27 diff --git a/N8N.md b/N8N.md new file mode 100644 index 0000000..1b015c6 --- /dev/null +++ b/N8N.md @@ -0,0 +1,308 @@ +# n8n - Workflow Automation + +n8n is an extendable workflow automation tool deployed on docker-host2 for automating tasks across your homelab and external services. + +--- + +## Quick Reference + +| Setting | Value | +|---------|-------| +| **URL** | https://n8n.htsn.io | +| **Local IP** | 10.10.10.207:5678 | +| **Server** | docker-host2 (PVE2 VMID 302) | +| **Database** | PostgreSQL (containerized) | +| **API Endpoint** | http://10.10.10.207:5678/api/v1/ | + +--- + +## Claude Code Integration (MCP) + +### n8n-MCP Server + +The n8n-MCP server gives Claude Code deep knowledge of all 545+ n8n nodes, enabling it to build complete workflows from natural language descriptions. + +**Installation:** Already configured in `~/Library/Application Support/Claude/claude_desktop_config.json` + +```json +{ + "mcpServers": { + "n8n-nodes": { + "command": "npx", + "args": ["-y", "@czlonkowski/n8n-mcp"] + } + } +} +``` + +**What This Enables:** +- ✅ Build n8n workflows from natural language +- ✅ Get detailed help with node parameters and options +- ✅ Best practices for n8n node usage +- ✅ Debug workflow issues with full node context + +**Example Prompts:** +``` +"Create an n8n workflow to monitor Prometheus and send Telegram alerts" +"Build a workflow that triggers when Syncthing has errors" +"What's the best n8n node to parse JSON responses?" 
+``` + +**How It Works:** +- MCP server provides offline documentation for all n8n nodes +- No connection to your n8n instance required +- Claude builds workflows that you can then import into https://n8n.htsn.io + +**Resources:** +- [n8n-MCP GitHub](https://github.com/czlonkowski/n8n-mcp) +- [MCP Documentation](https://docs.n8n.io/advanced-ai/accessing-n8n-mcp-server/) + +--- + +## API Access + +### API Key + +``` +X-N8N-API-KEY: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI3NTdiMDA5YS1hMjM2LTQ5MzUtODkwNS0xZDY1MjYzZWE2OWYiLCJpc3MiOiJuOG4iLCJhdWQiOiJwdWJsaWMtYXBpIiwiaWF0IjoxNzY2ODEwMzA3fQ.RIZAbpDa7LiUPWk48qOscJ9-d9gRAA0afMDX_V3oSVo +``` + +### API Examples + +**List Workflows:** +```bash +curl -H "X-N8N-API-KEY: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI3NTdiMDA5YS1hMjM2LTQ5MzUtODkwNS0xZDY1MjYzZWE2OWYiLCJpc3MiOiJuOG4iLCJhdWQiOiJwdWJsaWMtYXBpIiwiaWF0IjoxNzY2ODEwMzA3fQ.RIZAbpDa7LiUPWk48qOscJ9-d9gRAA0afMDX_V3oSVo" \ + http://10.10.10.207:5678/api/v1/workflows +``` + +**Get Workflow by ID:** +```bash +curl -H "X-N8N-API-KEY: YOUR_API_KEY" \ + http://10.10.10.207:5678/api/v1/workflows/{id} +``` + +**Trigger Workflow:** +```bash +curl -X POST \ + -H "X-N8N-API-KEY: YOUR_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"data": {"key": "value"}}' \ + http://10.10.10.207:5678/api/v1/workflows/{id}/execute +``` + +**API Documentation:** https://docs.n8n.io/api/ + +--- + +## Deployment Details + +### Docker Compose + +**Location:** `/opt/n8n/docker-compose.yml` on docker-host2 + +**Services:** +- `n8n` - Main application (port 5678) +- `postgres` - Database backend + +**Volumes:** +- `n8n_data` - Workflow data, credentials, settings +- `postgres_data` - Database storage + +### Environment Configuration + +```yaml +N8N_HOST: n8n.htsn.io +N8N_PORT: 5678 +N8N_PROTOCOL: https +NODE_ENV: production +WEBHOOK_URL: https://n8n.htsn.io/ +GENERIC_TIMEZONE: America/Los_Angeles +DB_TYPE: postgresdb +DB_POSTGRESDB_HOST: postgres +DB_POSTGRESDB_DATABASE: n8n 
+DB_POSTGRESDB_USER: n8n +DB_POSTGRESDB_PASSWORD: n8n_secure_password_2024 +``` + +### Resource Limits + +- **Memory**: 512MB-1GB (soft/hard) +- **CPU**: Shared (4 vCPUs on host) + +--- + +## Common Tasks + +### Restart n8n + +```bash +ssh docker-host2 'cd /opt/n8n && docker compose restart n8n' +``` + +### View Logs + +```bash +ssh docker-host2 'docker logs -f n8n' +``` + +### Backup Workflows + +Workflows are stored in PostgreSQL. To backup: + +```bash +ssh docker-host2 'docker exec n8n-postgres pg_dump -U n8n n8n > /tmp/n8n-backup-$(date +%Y%m%d).sql' +``` + +### Update n8n + +```bash +ssh docker-host2 'cd /opt/n8n && docker compose pull n8n && docker compose up -d n8n' +``` + +--- + +## Traefik Configuration + +**File:** `/etc/traefik/conf.d/n8n.yaml` on CT 202 + +```yaml +http: + routers: + n8n-secure: + entryPoints: + - websecure + rule: "Host(`n8n.htsn.io`)" + service: n8n + tls: + certResolver: cloudflare + priority: 50 + + n8n-redirect: + entryPoints: + - web + rule: "Host(`n8n.htsn.io`)" + middlewares: + - n8n-https-redirect + service: n8n + priority: 50 + + services: + n8n: + loadBalancer: + servers: + - url: "http://10.10.10.207:5678" + + middlewares: + n8n-https-redirect: + redirectScheme: + scheme: https + permanent: true +``` + +--- + +## Monitoring + +### Prometheus + +n8n exposes metrics at `http://10.10.10.207:5678/metrics` (if enabled) + +### Grafana + +n8n metrics can be visualized in Grafana dashboards + +### Uptime Monitoring + +Add to Pulse: https://pulse.htsn.io +- Monitor: https://n8n.htsn.io +- Check interval: 60s + +--- + +## Troubleshooting + +### n8n won't start + +```bash +ssh docker-host2 'docker logs n8n | tail -50' +ssh docker-host2 'docker logs n8n-postgres | tail -50' +``` + +### Database connection issues + +```bash +# Check postgres health +ssh docker-host2 'docker exec n8n-postgres pg_isready -U n8n' + +# Restart postgres +ssh docker-host2 'cd /opt/n8n && docker compose restart postgres' +``` + +### SSL/HTTPS issues + +```bash 
+# Check Traefik config +ssh root@10.10.10.250 'cat /etc/traefik/conf.d/n8n.yaml' + +# Reload Traefik +ssh root@10.10.10.250 'systemctl reload traefik' +``` + +### API not responding + +```bash +# Test API locally +curl -H "X-N8N-API-KEY: YOUR_KEY" http://10.10.10.207:5678/api/v1/workflows + +# Check if n8n container is healthy +ssh docker-host2 'docker ps | grep n8n' +``` + +--- + +## Integration Examples + +### Homelab Automation Ideas + +1. **Backup Notifications** - Send Telegram alerts when backups complete +2. **Server Monitoring** - Query Prometheus and alert on high CPU/memory +3. **Media Management** - Trigger Sonarr/Radarr downloads +4. **Home Assistant Integration** - Automate smart home workflows +5. **Git Webhooks** - Deploy changes from Gitea automatically +6. **Syncthing Monitoring** - Alert when sync folders get behind +7. **UPS Alerts** - Notify on power events from NUT + +--- + +## Security Notes + +- API key provides full access to all workflows and data +- Store API key securely (added to this doc for homelab reference) +- n8n credentials are encrypted at rest in PostgreSQL +- HTTPS enforced via Traefik +- No public internet exposure (only via Tailscale) + +--- + +## Quick Start + +**New to n8n?** Start here: **[N8N-INTEGRATIONS.md](N8N-INTEGRATIONS.md)** ⭐ + +This guide includes: +- ✅ Network access verification +- ✅ Credential setup for all homelab services +- ✅ 10 ready-to-use starter workflows +- ✅ Home Assistant, Prometheus, Syncthing, Telegram integrations +- ✅ Troubleshooting tips + +--- + +## Related Documentation + +- [n8n Homelab Integrations Guide](N8N-INTEGRATIONS.md) - **START HERE** +- [docker-host2 VM details](VMS.md) +- [Traefik reverse proxy](TRAEFIK.md) +- [IP Assignments](IP-ASSIGNMENTS.md) +- [Pulse Setup](PULSE-SETUP.md) + +**Last Updated:** 2025-12-26 diff --git a/PULSE-SETUP.md b/PULSE-SETUP.md new file mode 100644 index 0000000..d3957c1 --- /dev/null +++ b/PULSE-SETUP.md @@ -0,0 +1,69 @@ +# Add n8n and docker-host2 to 
Pulse Monitoring + +Pulse automatically monitors based on Prometheus targets, but you can also add custom HTTP monitors. + +## Quick Steps + +1. Open **https://pulse.htsn.io** in your browser +2. Login if required +3. Click **"+ Add Monitor"** or **"New Monitor"** + +--- + +## Monitor: n8n + +| Field | Value | +|-------|-------| +| **Name** | n8n Workflow Automation | +| **URL** | https://n8n.htsn.io | +| **Check Interval** | 60 seconds | +| **Monitor Type** | HTTP/HTTPS | +| **Expected Status** | 200 | +| **Timeout** | 10 seconds | +| **Alert After** | 2 failed checks | + +--- + +## Monitor: docker-host2 + +| Field | Value | +|-------|-------| +| **Name** | docker-host2 (node_exporter) | +| **URL** | http://10.10.10.207:9100/metrics | +| **Check Interval** | 60 seconds | +| **Monitor Type** | HTTP | +| **Expected Status** | 200 | +| **Expected Content** | `node_exporter` | +| **Timeout** | 5 seconds | +| **Alert After** | 2 failed checks | + +--- + +## Optional: docker-host2 SSH + +| Field | Value | +|-------|-------| +| **Name** | docker-host2 SSH | +| **Host** | 10.10.10.207 | +| **Port** | 22 | +| **Monitor Type** | TCP Port | +| **Check Interval** | 60 seconds | +| **Timeout** | 5 seconds | + +--- + +## Verification + +After adding monitors, you should see: +- ✅ Green status for both monitors +- Response time graphs +- Uptime percentage +- Alert history (should be empty) + +Access Pulse dashboard: **https://pulse.htsn.io** + +--- + +**Note:** Pulse may already be monitoring these services via Prometheus integration. Check existing monitors before adding duplicates. 
+ +**Last Updated:** 2025-12-27 diff --git a/README.md b/README.md index c10c8fc..ca6140f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Documentation for Hutson's home infrastructure - two Proxmox servers running VMs | Document | Description | |----------|-------------| +| [GATEWAY.md](GATEWAY.md) | UniFi gateway monitoring, watchdog services, troubleshooting | | [VMS.md](VMS.md) | Complete VM/LXC inventory, specs, GPU passthrough | | [HARDWARE.md](HARDWARE.md) | Server specs, GPUs, network cards, HBAs | | [STORAGE.md](STORAGE.md) | ZFS pools, NFS/SMB shares, capacity planning | @@ -145,4 +146,4 @@ git push --- -**Last Updated**: 2025-12-22 +**Last Updated**: 2026-01-02 diff --git a/SSH-ACCESS.md b/SSH-ACCESS.md index 36507f1..2f7a3e7 100644 --- a/SSH-ACCESS.md +++ b/SSH-ACCESS.md @@ -26,12 +26,14 @@ Use these convenient aliases instead of IP addresses: | Host Alias | IP | User | Type | Notes | |------------|-----|------|------|-------| +| `ucg-fiber` / `gateway` | 10.10.10.1 | root | UniFi Gateway | Router/firewall | | `pve` | 10.10.10.120 | root | Proxmox | Primary server | | `pve2` | 10.10.10.102 | root | Proxmox | Secondary server | | `truenas` | 10.10.10.200 | root | VM | NAS/storage | | `saltbox` | 10.10.10.100 | hutson | VM | Media automation | | `lmdev1` | 10.10.10.111 | hutson | VM | AI/LLM development | -| `docker-host` | 10.10.10.206 | hutson | VM | Docker services | +| `docker-host` | 10.10.10.206 | hutson | VM | Docker services (PVE) | +| `docker-host2` | 10.10.10.207 | hutson | VM | Docker services (PVE2) - MetaMCP, n8n | | `fs-dev` | 10.10.10.5 | hutson | VM | Development | | `copyparty` | 10.10.10.201 | hutson | VM | File sharing | | `gitea-vm` | 10.10.10.220 | hutson | VM | Git server | @@ -100,6 +102,11 @@ Host docker-host User hutson IdentityFile ~/.ssh/homelab +Host docker-host2 + HostName 10.10.10.207 + User hutson + IdentityFile ~/.ssh/homelab + Host fs-dev HostName 10.10.10.5 User hutson @@ -143,25 +150,29 @@ Host findshyt Some 
systems don't support SSH key auth or have other limitations. -### UniFi Router (10.10.10.1) +### UniFi Router (10.10.10.1) - NOW USES KEY AUTH -**Issue**: Uses `keyboard-interactive` auth method, incompatible with `sshpass` -**Solution**: Use `expect` to automate password entry +**Host alias**: `ucg-fiber` or `gateway` + +**Status**: SSH key authentication now works (as of 2026-01-02) **Commands**: ```bash -# Run command on router -expect -c 'spawn ssh root@10.10.10.1 "hostname"; expect "Password:"; send "GrilledCh33s3#\r"; expect eof' +# Run command on router (using SSH key) +ssh ucg-fiber 'hostname' # Get ARP table (all device IPs) -expect -c 'spawn ssh root@10.10.10.1 "cat /proc/net/arp"; expect "Password:"; send "GrilledCh33s3#\r"; expect eof' +ssh ucg-fiber 'cat /proc/net/arp' # Check Tailscale status -expect -c 'spawn ssh root@10.10.10.1 "tailscale status"; expect "Password:"; send "GrilledCh33s3#\r"; expect eof' +ssh ucg-fiber 'tailscale status' + +# Check memory usage +ssh ucg-fiber 'free -m' ``` -**Why not key auth?**: UniFi router firmware doesn't persist SSH keys across reboots. +**Note**: Key may need to be re-deployed after firmware updates if UniFi clears authorized_keys. 
### Windows PC (10.10.10.150) diff --git a/TRAEFIK.md b/TRAEFIK.md index 77af1d9..7d3cdf5 100644 --- a/TRAEFIK.md +++ b/TRAEFIK.md @@ -61,6 +61,7 @@ ssh pve 'pct exec 202 -- tail -f /var/log/traefik/traefik.log' | Gitea | git.htsn.io | 10.10.10.220:3000 | | Home Assistant | homeassistant.htsn.io | 10.10.10.110 | | LM Dev | lmdev.htsn.io | 10.10.10.111 | +| MetaMCP | metamcp.htsn.io | 10.10.10.207:12008 (docker-host2) | | Pi-hole | pihole.htsn.io | 10.10.10.200 | | TrueNAS | truenas.htsn.io | 10.10.10.200 | | Proxmox | pve.htsn.io | 10.10.10.120 | diff --git a/VMS.md b/VMS.md index b8e1d4c..1d01209 100644 --- a/VMS.md +++ b/VMS.md @@ -7,8 +7,8 @@ Complete inventory of all virtual machines and LXC containers across both Proxmo | Server | VMs | LXCs | Total | |--------|-----|------|-------| | **PVE** (10.10.10.120) | 6 | 3 | 9 | -| **PVE2** (10.10.10.102) | 2 | 0 | 2 | -| **Total** | **8** | **3** | **11** | +| **PVE2** (10.10.10.102) | 3 | 0 | 3 | +| **Total** | **9** | **3** | **12** | --- @@ -44,6 +44,7 @@ Complete inventory of all virtual machines and LXC containers across both Proxmo |------|------|-----|-------|-----|---------|---------|-----------------|------------| | **300** | gitea-vm | 10.10.10.220 | 2 | 4GB | nvme-mirror3 | Git server (Gitea) | - | ✅ Yes | | **301** | trading-vm | 10.10.10.221 | 16 | 32GB | nvme-mirror3 | AI trading platform | RTX A6000 | ✅ Yes | +| **302** | docker-host2 | 10.10.10.207 | 4 | 8GB | nvme-mirror3 | Docker host (n8n, automation) | - | ✅ Yes | ### LXC Containers diff --git a/client_secret_693027753314-hdjfnvfnarlcnehba6u8plbehv78rfh9.apps.googleusercontent.com.json b/client_secret_693027753314-hdjfnvfnarlcnehba6u8plbehv78rfh9.apps.googleusercontent.com.json new file mode 100644 index 0000000..0a82299 --- /dev/null +++ b/client_secret_693027753314-hdjfnvfnarlcnehba6u8plbehv78rfh9.apps.googleusercontent.com.json @@ -0,0 +1 @@ 
+{"web":{"client_id":"693027753314-hdjfnvfnarlcnehba6u8plbehv78rfh9.apps.googleusercontent.com","project_id":"spheric-method-482514-f8","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"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","redirect_uris":["https://my.home-assistant.io/redirect/oauth"]}}
\ No newline at end of file
diff --git a/data/scripts/internet-watchdog.sh b/data/scripts/internet-watchdog.sh
new file mode 100644
index 0000000..3858a55
--- /dev/null
+++ b/data/scripts/internet-watchdog.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Internet Watchdog - Reboots if internet is unreachable for 5 minutes
+#
+# Every CHECK_INTERVAL seconds, pings a fixed list of public resolvers.
+# A round fails only if none of them answers. After MAX_FAILS consecutive
+# failed rounds the host is rebooted. All events go to LOG_FILE.
+#
+# NOTE(review): if this is started again automatically after the reboot
+# (not visible from this script), a long upstream outage means a reboot
+# roughly every MAX_FAILS * CHECK_INTERVAL seconds - consider a backoff.
+LOG_FILE="/var/log/internet-watchdog.log"
+FAIL_COUNT=0
+MAX_FAILS=5
+CHECK_INTERVAL=60
+
+# Append a timestamped message ($1) to LOG_FILE.
+log() {
+    printf '%s - %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$1" >> "$LOG_FILE"
+}
+
+# Succeed as soon as any endpoint answers a single ping;
+# fail only when every endpoint is unreachable.
+check_internet() {
+    local endpoint
+    for endpoint in 1.1.1.1 8.8.8.8 208.67.222.222; do
+        if ping -c 1 -W 5 "$endpoint" > /dev/null 2>&1; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+log "Watchdog started"
+
+while true; do
+    if check_internet; then
+        if [ "$FAIL_COUNT" -gt 0 ]; then
+            log "Internet restored after $FAIL_COUNT failures"
+        fi
+        FAIL_COUNT=0
+    else
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+        log "Internet check failed ($FAIL_COUNT/$MAX_FAILS)"
+
+        if [ "$FAIL_COUNT" -ge "$MAX_FAILS" ]; then
+            log "CRITICAL: $MAX_FAILS consecutive failures - REBOOTING"
+            sync
+            sleep 2
+            # Record a failed reboot instead of looping on silently; the
+            # next failed round retries it.
+            if ! reboot; then
+                log "ERROR: reboot command failed"
+            fi
+        fi
+    fi
+    sleep "$CHECK_INTERVAL"
+done
diff --git a/data/scripts/memory-monitor.sh b/data/scripts/memory-monitor.sh
new file mode 100644
index 0000000..34d8207
--- /dev/null
+++ b/data/scripts/memory-monitor.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Memory Monitor - periodically appends a memory snapshot to LOG_FILE.
+#
+# Each snapshot holds the output of `free -m` and the first 12 lines of
+# `ps` sorted by descending RSS. Once LOG_FILE grows past MAX_LOG_BYTES it
+# is moved to LOG_FILE.old before the next snapshot is written.
+LOG_DIR="/data/logs"
+LOG_FILE="$LOG_DIR/memory-history.log"
+MAX_LOG_BYTES=10485760   # 10 MiB
+CHECK_INTERVAL=600       # seconds between snapshots
+
+while true; do
+    # Re-create the directory every round so snapshots resume if it was
+    # missing at start-up or removed while running.
+    mkdir -p "$LOG_DIR"
+
+    # Rotate if over 10MB
+    if [ -f "$LOG_FILE" ]; then
+        SIZE=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
+        if [ "$SIZE" -gt "$MAX_LOG_BYTES" ]; then
+            mv "$LOG_FILE" "$LOG_FILE.old"
+        fi
+    fi
+
+    # One redirection for the whole snapshot instead of reopening the
+    # log for every line.
+    {
+        echo "========== $(date '+%Y-%m-%d %H:%M:%S') =========="
+        echo "--- MEMORY ---"
+        free -m
+        echo "--- TOP MEMORY PROCESSES ---"
+        ps -eo pid,rss,comm --sort=-rss | head -12
+        echo ""
+    } >> "$LOG_FILE"
+
+    sleep "$CHECK_INTERVAL"
+done