Add homelab monitoring, portfolio site, and vault tooling.
Some checks failed
CI / skip-ci-check (pull_request) Successful in 6s
CI / lint-and-test (pull_request) Failing after 9s
CI / ansible-validation (pull_request) Failing after 6s
CI / secret-scanning (pull_request) Successful in 5s
CI / dependency-scan (pull_request) Successful in 8s
CI / sast-scan (pull_request) Failing after 5s
CI / license-check (pull_request) Successful in 11s
CI / vault-check (pull_request) Failing after 6s
CI / playbook-test (pull_request) Failing after 6s
CI / container-scan (pull_request) Failing after 6s
CI / sonar-analysis (pull_request) Failing after 2s
CI / workflow-summary (pull_request) Successful in 4s
Some checks failed
CI / skip-ci-check (pull_request) Successful in 6s
CI / lint-and-test (pull_request) Failing after 9s
CI / ansible-validation (pull_request) Failing after 6s
CI / secret-scanning (pull_request) Successful in 5s
CI / dependency-scan (pull_request) Successful in 8s
CI / sast-scan (pull_request) Failing after 5s
CI / license-check (pull_request) Successful in 11s
CI / vault-check (pull_request) Failing after 6s
CI / playbook-test (pull_request) Failing after 6s
CI / container-scan (pull_request) Failing after 6s
CI / sonar-analysis (pull_request) Failing after 2s
CI / workflow-summary (pull_request) Successful in 4s
Document pve10 static IPs, monitoring stack, and site LXCs; add portfolio to inventory; Mailcow mailbox automation; vault import/export scripts; security audit guides and UniFi DHCP reference. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
9281f12a65
commit
de49b34cdc
37
.env.example
Normal file
37
.env.example
Normal file
@ -0,0 +1,37 @@
|
||||
# Copy to .env (gitignored): cp .env.example .env
|
||||
#
|
||||
# vault → .env: make vault-export-env
|
||||
# .env → vault: make vault-import-env
|
||||
# hosts → vault: make vault-pull-infra-secrets (SSH to monitoring/hermes, then import)
|
||||
#
|
||||
# Prefer the vault for long-term storage; you can delete .env once its values have been imported into the vault.
|
||||
|
||||
# Mailcow (make mailcow-mailbox MAILBOX=alerts)
|
||||
MAILCOW_API_KEY=
|
||||
ALERTS_PASSWORD=
|
||||
|
||||
# Uptime Kuma @ 10.0.10.22:3001 (scripts/kuma-setup-smtp.sh)
|
||||
KUMA_URL=http://10.0.10.22:3001
|
||||
KUMA_USER=admin
|
||||
KUMA_PASSWORD=
|
||||
|
||||
# Kuma SMTP notification (after alerts@ mailbox exists)
|
||||
SMTP_HOST=mail.levkine.ca
|
||||
SMTP_PORT=587
|
||||
SMTP_USER=alerts@levkine.ca
|
||||
SMTP_PASS=
|
||||
SMTP_TO=idobkin@gmail.com
|
||||
|
||||
# Umami @ 10.0.10.22:3000 (admin UI password; DB pass is on LXC only)
|
||||
UMAMI_ADMIN_PASSWORD=
|
||||
|
||||
# Hermes Mattermost (not Telegram)
|
||||
MATTERMOST_URL=
|
||||
MATTERMOST_TOKEN=
|
||||
MATTERMOST_ALLOWED_USERS=
|
||||
|
||||
# Optional: same password on Proxmox / LXCs / caddy root (if you use one shared admin password)
|
||||
# PROXMOX_PASSWORD=
|
||||
# LXC_ROOT_PASSWORD=
|
||||
|
||||
# Per-mailbox: MAILBOX_notify_PASSWORD=
|
||||
@ -103,7 +103,8 @@ jobs:
|
||||
inventory = /tmp/ci-inventory.ini
|
||||
roles_path = /workspace/ilia/ansible/roles
|
||||
host_key_checking = False
|
||||
stdout_callback = yaml
|
||||
stdout_callback = default
|
||||
callback_result_format = yaml
|
||||
bin_ansible_callbacks = True
|
||||
retry_files_enabled = False
|
||||
interpreter_python = auto_silent
|
||||
@ -305,7 +306,8 @@ jobs:
|
||||
inventory = /tmp/ci-inventory.ini
|
||||
roles_path = /workspace/ilia/ansible/roles
|
||||
host_key_checking = False
|
||||
stdout_callback = yaml
|
||||
stdout_callback = default
|
||||
callback_result_format = yaml
|
||||
bin_ansible_callbacks = True
|
||||
retry_files_enabled = False
|
||||
interpreter_python = auto_silent
|
||||
|
||||
10
.gitignore
vendored
10
.gitignore
vendored
@ -17,6 +17,9 @@ id_rsa
|
||||
id_ed25519
|
||||
id_ecdsa
|
||||
|
||||
# Python venv (make bootstrap)
|
||||
.venv/
|
||||
|
||||
# Python bytecode
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
@ -34,4 +37,11 @@ Thumbs.db
|
||||
|
||||
.ansible/facts/
|
||||
|
||||
# Local data exports (Nextcloud, etc.)
|
||||
exports/
|
||||
|
||||
# Local secrets (Mailcow API, Kuma passwords) — never commit
|
||||
.env
|
||||
.env.local
|
||||
|
||||
node_modules/
|
||||
221
Makefile
221
Makefile
@ -1,4 +1,4 @@
|
||||
.PHONY: help bootstrap lint test check dev datascience inventory inventory-all local servers workstations clean status tailscale tailscale-check tailscale-dev tailscale-status create-vault create-vm monitoring
|
||||
.PHONY: help bootstrap lint test check dev datascience inventory inventory-all local servers workstations clean status tailscale tailscale-check tailscale-dev tailscale-status create-vault create-vm monitoring copy-ssh-key copy-ssh-keys copy-ssh-keys-ansible copy-ssh-key-mailcow bootstrap-root-ssh bootstrap-root-ssh-services bootstrap-root-ssh-failed mailcow-mailbox mailcow-create-alerts vault-import-env
|
||||
.DEFAULT_GOAL := help
|
||||
|
||||
## Colors for output
|
||||
@ -28,13 +28,27 @@ PYTHON_REQ := requirements.txt
|
||||
INVENTORY := inventories/production
|
||||
INVENTORY_HOSTS := $(INVENTORY)/hosts
|
||||
|
||||
# Python venv (created by `make bootstrap`)
|
||||
VENV := .venv
|
||||
ifneq ($(wildcard $(VENV)/bin/ansible-playbook),)
|
||||
export PATH := $(abspath $(VENV)/bin):$(PATH)
|
||||
ANSIBLE_VAULT := $(abspath $(VENV))/bin/ansible-vault
|
||||
else
|
||||
ANSIBLE_VAULT := ansible-vault
|
||||
endif
|
||||
|
||||
# Common ansible-playbook command with options
|
||||
ANSIBLE_PLAYBOOK := ansible-playbook -i $(INVENTORY)
|
||||
ANSIBLE_ARGS := --vault-password-file ~/.ansible-vault-pass
|
||||
# Note: sudo passwords are in vault files as ansible_become_password
|
||||
|
||||
## Auto-detect current host to exclude from remote operations
|
||||
CURRENT_IP := $(shell hostname -I | awk '{print $$1}')
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CURRENT_IP := $(shell ipconfig getifaddr en0 2>/dev/null || ipconfig getifaddr en1 2>/dev/null || echo "")
|
||||
else
|
||||
CURRENT_IP := $(shell hostname -I 2>/dev/null | awk '{print $$1}')
|
||||
endif
|
||||
# NOTE: inventory parsing may require vault secrets. Keep this best-effort and silent in CI.
|
||||
CURRENT_HOST := $(shell ansible-inventory --list --vault-password-file ~/.ansible-vault-pass 2>/dev/null | jq -r '._meta.hostvars | to_entries[] | select(.value.ansible_host == "$(CURRENT_IP)") | .key' 2>/dev/null | head -1)
|
||||
EXCLUDE_CURRENT := $(if $(CURRENT_HOST),--limit '!$(CURRENT_HOST)',)
|
||||
@ -59,37 +73,36 @@ help: ## Show this help message
|
||||
@echo " make maintenance-verbose GROUP=dev # Verbose maintenance on dev group"
|
||||
@echo ""
|
||||
|
||||
require-ansible: ## Verify ansible is available (run make bootstrap if missing)
|
||||
@command -v ansible-playbook >/dev/null 2>&1 && command -v ansible-vault >/dev/null 2>&1 || { \
|
||||
echo "$(RED)ansible-playbook/ansible-vault not found$(RESET)"; \
|
||||
echo "Run: $(BLUE)make bootstrap$(RESET)"; \
|
||||
exit 1; \
|
||||
}
|
||||
|
||||
bootstrap: ## Install all project dependencies from requirements files
|
||||
@echo "$(BOLD)Installing Project Dependencies$(RESET)"
|
||||
@echo ""
|
||||
@echo "$(YELLOW)Python Requirements ($(PYTHON_REQ)):$(RESET)"
|
||||
@if [ -f "$(PYTHON_REQ)" ]; then \
|
||||
if command -v pipx >/dev/null 2>&1; then \
|
||||
printf " %-30s " "Installing with pipx"; \
|
||||
if pipx install -r $(PYTHON_REQ) >/dev/null 2>&1; then \
|
||||
echo "$(GREEN)✓ Installed$(RESET)"; \
|
||||
else \
|
||||
echo "$(YELLOW)⚠ Some packages may have failed$(RESET)"; \
|
||||
fi; \
|
||||
elif command -v pip3 >/dev/null 2>&1; then \
|
||||
printf " %-30s " "Installing with pip3 --user"; \
|
||||
if pip3 install --user -r $(PYTHON_REQ) >/dev/null 2>&1; then \
|
||||
echo "$(GREEN)✓ Installed$(RESET)"; \
|
||||
else \
|
||||
printf " %-30s " "Trying with --break-system-packages"; \
|
||||
if pip3 install --break-system-packages -r $(PYTHON_REQ) >/dev/null 2>&1; then \
|
||||
echo "$(GREEN)✓ Installed$(RESET)"; \
|
||||
else \
|
||||
echo "$(RED)✗ Failed$(RESET)"; \
|
||||
fi; \
|
||||
fi; \
|
||||
else \
|
||||
printf " %-30s " "Python packages"; \
|
||||
echo "$(YELLOW)⚠ Skipped (pip3/pipx not found)$(RESET)"; \
|
||||
fi; \
|
||||
else \
|
||||
@echo "$(YELLOW)Python venv ($(VENV))/$(PYTHON_REQ):$(RESET)"
|
||||
@if [ ! -f "$(PYTHON_REQ)" ]; then \
|
||||
printf " %-30s " "$(PYTHON_REQ)"; \
|
||||
echo "$(RED)✗ File not found$(RESET)"; \
|
||||
elif ! command -v python3 >/dev/null 2>&1; then \
|
||||
printf " %-30s " "Python venv"; \
|
||||
echo "$(RED)✗ python3 not found$(RESET)"; \
|
||||
else \
|
||||
if [ ! -d "$(VENV)" ]; then \
|
||||
printf " %-30s " "Creating venv"; \
|
||||
python3 -m venv "$(VENV)" && echo "$(GREEN)✓ Created$(RESET)" || { echo "$(RED)✗ Failed$(RESET)"; exit 1; }; \
|
||||
fi; \
|
||||
printf " %-30s " "Installing packages"; \
|
||||
if "$(VENV)/bin/pip" install -r "$(PYTHON_REQ)" >/dev/null 2>&1; then \
|
||||
echo "$(GREEN)✓ Installed$(RESET)"; \
|
||||
echo " $(BLUE)Ansible:$(RESET) $(abspath $(VENV))/bin/ansible-playbook"; \
|
||||
else \
|
||||
echo "$(RED)✗ Failed$(RESET)"; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "$(YELLOW)Node.js Dependencies (package.json):$(RESET)"
|
||||
@ -107,7 +120,9 @@ bootstrap: ## Install all project dependencies from requirements files
|
||||
@echo ""
|
||||
@echo "$(YELLOW)Ansible Collections ($(COLLECTIONS_REQ)):$(RESET)"
|
||||
@if [ -f "$(COLLECTIONS_REQ)" ]; then \
|
||||
ansible-galaxy collection install -r $(COLLECTIONS_REQ) 2>&1 | grep -E "(Installing|Skipping|ERROR)" | while read line; do \
|
||||
GALAXY="$$(command -v ansible-galaxy)"; \
|
||||
[ -x "$(VENV)/bin/ansible-galaxy" ] && GALAXY="$(abspath $(VENV))/bin/ansible-galaxy"; \
|
||||
"$$GALAXY" collection install -r $(COLLECTIONS_REQ) 2>&1 | grep -E "(Installing|Skipping|ERROR)" | while read line; do \
|
||||
if echo "$$line" | grep -q "Installing"; then \
|
||||
collection=$$(echo "$$line" | awk '{print $$2}' | sed 's/:.*//'); \
|
||||
printf " $(GREEN)✓ %-30s$(RESET) Installed\n" "$$collection"; \
|
||||
@ -117,7 +132,7 @@ bootstrap: ## Install all project dependencies from requirements files
|
||||
elif echo "$$line" | grep -q "ERROR"; then \
|
||||
printf " $(RED)✗ Error: $$line$(RESET)\n"; \
|
||||
fi; \
|
||||
done || ansible-galaxy collection install -r $(COLLECTIONS_REQ); \
|
||||
done || "$$GALAXY" collection install -r $(COLLECTIONS_REQ); \
|
||||
else \
|
||||
printf " %-30s " "$(COLLECTIONS_REQ)"; \
|
||||
echo "$(RED)✗ File not found$(RESET)"; \
|
||||
@ -265,6 +280,10 @@ servers: ## Run baseline server playbook (usage: make servers [GROUP=services] [
|
||||
$(ANSIBLE_PLAYBOOK) $(PLAYBOOK_SERVERS); \
|
||||
fi
|
||||
|
||||
caddy-auth: require-ansible ## Ensure auth.levkin.ca reverse proxy on Caddy VM
|
||||
@echo "$(YELLOW)Updating Caddy for Authentik...$(RESET)"
|
||||
$(ANSIBLE_PLAYBOOK) playbooks/caddy-auth-authentik.yml $(ANSIBLE_ARGS)
|
||||
|
||||
workstations: ## Run workstation baseline (usage: make workstations [GROUP=dev] [HOST=dev01])
|
||||
@echo "$(YELLOW)Applying workstation baseline...$(RESET)"
|
||||
@EXTRA=""; \
|
||||
@ -426,7 +445,7 @@ apps: ## Install applications only
|
||||
$(ANSIBLE_PLAYBOOK) $(PLAYBOOK_WORKSTATIONS) --tags apps
|
||||
|
||||
# Connectivity targets
|
||||
ping: auto-fallback ## Ping hosts with colored output (usage: make ping [GROUP=dev] [HOST=dev01])
|
||||
ping: require-ansible auto-fallback ## Ping hosts with colored output (usage: make ping [GROUP=dev] [HOST=dev01])
|
||||
ifdef HOST
|
||||
@echo "$(YELLOW)Pinging host: $(HOST)$(RESET)"
|
||||
@ansible $(HOST) -m ping --one-line | while read line; do \
|
||||
@ -543,16 +562,25 @@ tailscale-status: ## Check Tailscale status on all machines
|
||||
done
|
||||
|
||||
# Vault management
|
||||
edit-vault: ## Edit encrypted host vars (usage: make edit-vault HOST=dev01)
|
||||
edit-vault: require-ansible ## Edit encrypted host vars (usage: make edit-vault HOST=KrakenMint)
|
||||
ifndef HOST
|
||||
@echo "$(RED)Error: HOST parameter required$(RESET)"
|
||||
@echo "Usage: make edit-vault HOST=dev01"
|
||||
@echo "Usage: make edit-vault HOST=KrakenMint"
|
||||
@exit 1
|
||||
endif
|
||||
ansible-vault edit host_vars/$(HOST).yml
|
||||
@vault_file="$(INVENTORY)/host_vars/$(HOST)/vault.yml"; \
|
||||
if [ ! -f "$$vault_file" ]; then vault_file="$(INVENTORY)/host_vars/$(HOST).yml"; fi; \
|
||||
if [ ! -f "$$vault_file" ]; then \
|
||||
echo "$(RED)No vault file for $(HOST):$(RESET)"; \
|
||||
echo " $(INVENTORY)/host_vars/$(HOST)/vault.yml"; \
|
||||
echo " $(INVENTORY)/host_vars/$(HOST).yml"; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
echo "$(BLUE)Editing $$vault_file$(RESET)"; \
|
||||
$(ANSIBLE_VAULT) edit "$$vault_file"
|
||||
|
||||
edit-group-vault: ## Edit encrypted group vars (usage: make edit-group-vault)
|
||||
ansible-vault edit inventories/production/group_vars/all/vault.yml
|
||||
edit-group-vault: require-ansible ## Edit encrypted group vars (usage: make edit-group-vault)
|
||||
$(ANSIBLE_VAULT) edit $(INVENTORY)/group_vars/all/vault.yml
|
||||
|
||||
|
||||
copy-ssh-key: ## Copy SSH key to specific host (usage: make copy-ssh-key HOST=giteaVM)
|
||||
@ -562,19 +590,128 @@ ifndef HOST
|
||||
@exit 1
|
||||
endif
|
||||
@echo "$(YELLOW)Copying SSH key to $(HOST)...$(RESET)"
|
||||
@ip=$$(ansible-inventory --list | jq -r "._meta.hostvars.$(HOST).ansible_host // empty" 2>/dev/null); \
|
||||
user=$$(ansible-inventory --list | jq -r "._meta.hostvars.$(HOST).ansible_user // empty" 2>/dev/null); \
|
||||
if [ -n "$$ip" ] && [ "$$ip" != "null" ] && [ -n "$$user" ] && [ "$$user" != "null" ]; then \
|
||||
@ip=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r --arg h "$(HOST)" '._meta.hostvars[$$h].ansible_host // empty'); \
|
||||
user=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r --arg h "$(HOST)" '._meta.hostvars[$$h].ansible_user // empty'); \
|
||||
if [ -z "$$ip" ] || [ "$$ip" = "null" ]; then \
|
||||
ip=$$(awk -v h="$(HOST)" '$$1==h {print $$2}' $(INVENTORY_HOSTS) | sed 's/ansible_host=//'); \
|
||||
fi; \
|
||||
if [ -z "$$user" ] || [ "$$user" = "null" ]; then \
|
||||
user=$$(awk -v h="$(HOST)" '$$1==h {for(i=2;i<=NF;i++) if($$i~/^ansible_user=/) {sub(/ansible_user=/,"",$$i); print $$i; exit}}' $(INVENTORY_HOSTS)); \
|
||||
fi; \
|
||||
if [ -n "$$ip" ] && [ -n "$$user" ]; then \
|
||||
echo "Target: $$user@$$ip"; \
|
||||
ssh-copy-id $$user@$$ip; \
|
||||
ssh-copy-id -i "$${SSH_PUBLIC_KEY:-$$HOME/.ssh/id_ed25519.pub}" "$$user@$$ip"; \
|
||||
else \
|
||||
echo "$(RED)Could not determine IP or user for $(HOST)$(RESET)"; \
|
||||
echo "Check your inventory and host_vars"; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
create-vault: ## Create encrypted vault file for secrets (passwords, auth keys, etc.)
|
||||
copy-ssh-keys: ## Copy SSH key to all inventory hosts (usage: make copy-ssh-keys [GROUP=services])
|
||||
@echo "$(YELLOW)Copying SSH key to inventory hosts...$(RESET)"
|
||||
@echo "Using key: $${SSH_PUBLIC_KEY:-$$HOME/.ssh/id_ed25519.pub}"
|
||||
@echo "$(YELLOW)You will be prompted for each host's password (last time).$(RESET)"
|
||||
@failed=0; ok=0; \
|
||||
if [ -n "$(GROUP)" ]; then \
|
||||
hosts=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r ".\"$(GROUP)\".hosts[]? // empty"); \
|
||||
else \
|
||||
hosts=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r '._meta.hostvars | keys[]' | grep -v '^localhost$$' | sort); \
|
||||
fi; \
|
||||
if [ -z "$$hosts" ]; then \
|
||||
if [ -n "$(GROUP)" ]; then \
|
||||
hosts=$$(awk -v g="$(GROUP)" 'BEGIN{ing=0} /^\[/ {ing=($$0=="["g"]"); next} ing && /^[a-zA-Z]/ {print $$1}' $(INVENTORY_HOSTS)); \
|
||||
else \
|
||||
hosts=$$(awk '/^\[/ {next} /^[a-zA-Z]/ && $$1!="localhost" {print $$1}' $(INVENTORY_HOSTS)); \
|
||||
fi; \
|
||||
fi; \
|
||||
for host in $$hosts; do \
|
||||
echo ""; echo "$(BLUE)==> $$host$(RESET)"; \
|
||||
if $(MAKE) --no-print-directory copy-ssh-key HOST=$$host; then ok=$$((ok+1)); else failed=$$((failed+1)); fi; \
|
||||
done; \
|
||||
echo ""; \
|
||||
echo "$(GREEN)Done: $$ok succeeded$(RESET), $(RED)$$failed failed$(RESET)"; \
|
||||
[ $$failed -eq 0 ]
|
||||
|
||||
copy-ssh-keys-ansible: require-ansible ## Copy SSH key via Ansible (usage: make copy-ssh-keys-ansible [GROUP=services] [HOST=dev01])
|
||||
@echo "$(YELLOW)Deploying SSH key with Ansible (may prompt for SSH password)...$(RESET)"
|
||||
@limit="all:!local"; \
|
||||
[ -n "$(GROUP)" ] && limit="$(GROUP)"; \
|
||||
[ -n "$(HOST)" ] && limit="$(HOST)"; \
|
||||
$(ANSIBLE_PLAYBOOK) playbooks/ssh-keys.yml $(ANSIBLE_ARGS) --limit "$$limit" --ask-pass
|
||||
|
||||
copy-ssh-key-mailcow: ## Copy SSH key to Mailcow VM (root@10.0.10.132 on pve201; prompts for root password once)
|
||||
@$(MAKE) --no-print-directory copy-ssh-key HOST=mailcow
|
||||
|
||||
bootstrap-root-ssh-caddy: ## Bootstrap root on caddy via su + vault_lxc_root_password
|
||||
@chmod +x scripts/bootstrap-root-ssh-su-password.sh scripts/load-vault-lxc-root-password.sh
|
||||
@. scripts/load-vault-lxc-root-password.sh; ./scripts/bootstrap-root-ssh-su-password.sh caddy
|
||||
|
||||
bootstrap-root-ssh: ## SSH as ladmin, su to root, install root key (usage: make bootstrap-root-ssh HOST=listmonk)
|
||||
ifndef HOST
|
||||
@echo "$(RED)Error: HOST parameter required$(RESET)"
|
||||
@echo "Usage: make bootstrap-root-ssh HOST=listmonk"
|
||||
@exit 1
|
||||
endif
|
||||
@chmod +x scripts/bootstrap-root-ssh.sh
|
||||
@BOOTSTRAP_USER="$(BOOTSTRAP_USER)" TARGET_USER="$(TARGET_USER)" \
|
||||
scripts/bootstrap-root-ssh.sh "$(HOST)"
|
||||
|
||||
bootstrap-root-ssh-services: ## Bootstrap root SSH via ladmin (caddy, listmonk, vikanjans)
|
||||
@chmod +x scripts/bootstrap-root-ssh.sh
|
||||
@failed=0; ok=0; \
|
||||
for host in caddy listmonk vikanjans; do \
|
||||
echo ""; echo "$(BLUE)==> $$host$(RESET)"; \
|
||||
if BOOTSTRAP_USER="$(BOOTSTRAP_USER)" scripts/bootstrap-root-ssh.sh "$$host"; then \
|
||||
ok=$$((ok+1)); \
|
||||
else \
|
||||
failed=$$((failed+1)); \
|
||||
fi; \
|
||||
done; \
|
||||
echo ""; echo "$(GREEN)Done: $$ok succeeded$(RESET), $(RED)$$failed failed$(RESET)"; \
|
||||
[ $$failed -eq 0 ]
|
||||
|
||||
mailcow-mailbox: ## Create Mailcow mailbox (usage: make mailcow-mailbox MAILBOX=alerts)
|
||||
ifndef MAILBOX
|
||||
@echo "$(RED)Error: MAILBOX required$(RESET)"
|
||||
@echo "Usage: make mailcow-mailbox MAILBOX=alerts"
|
||||
@echo "Define mailboxes in inventories/production/group_vars/all/mailcow.yml"
|
||||
@exit 1
|
||||
endif
|
||||
@chmod +x scripts/run-mailcow-mailbox.sh
|
||||
@MAILBOX="$(MAILBOX)" ./scripts/run-mailcow-mailbox.sh
|
||||
|
||||
mailcow-create-alerts: ## Alias for make mailcow-mailbox MAILBOX=alerts
|
||||
@$(MAKE) --no-print-directory mailcow-mailbox MAILBOX=alerts
|
||||
|
||||
vault-pull-infra-secrets: ## Pull Umami/Mattermost from hosts → .env → vault (not vault→.env)
|
||||
@chmod +x scripts/vault-pull-infra-secrets.sh scripts/vault-import-env.sh
|
||||
@./scripts/vault-pull-infra-secrets.sh
|
||||
|
||||
vault-export-env: ## Write vault secrets into .env (keeps existing non-empty keys)
|
||||
@chmod +x scripts/vault-export-env.sh
|
||||
@./scripts/vault-export-env.sh "$(or $(ENV_FILE),.env)"
|
||||
|
||||
vault-import-env: ## Merge .env secrets into Ansible vault (usage: make vault-import-env [ENV_FILE=.env])
|
||||
@chmod +x scripts/vault-import-env.sh
|
||||
@ENV_FILE="$(or $(ENV_FILE),.env)" scripts/vault-import-env.sh "$(or $(ENV_FILE),.env)"
|
||||
|
||||
bootstrap-root-ssh-failed: ## Bootstrap root SSH on hosts that failed direct root copy-ssh-keys
|
||||
@chmod +x scripts/bootstrap-root-ssh.sh
|
||||
@failed=0; ok=0; \
|
||||
for host in caddy listmonk vikanjans n8n qBittorrent actual caseware auto mailcow; do \
|
||||
echo ""; echo "$(BLUE)==> $$host$(RESET)"; \
|
||||
if BOOTSTRAP_USER="$(BOOTSTRAP_USER)" scripts/bootstrap-root-ssh.sh "$$host"; then \
|
||||
ok=$$((ok+1)); \
|
||||
else \
|
||||
failed=$$((failed+1)); \
|
||||
fi; \
|
||||
done; \
|
||||
echo ""; echo "$(GREEN)Done: $$ok succeeded$(RESET), $(RED)$$failed failed$(RESET)"; \
|
||||
[ $$failed -eq 0 ]
|
||||
|
||||
create-vault: require-ansible ## Create encrypted vault file for secrets (passwords, auth keys, etc.)
|
||||
@echo "$(YELLOW)Creating vault file for storing secrets...$(RESET)"
|
||||
ansible-vault create group_vars/all/vault.yml
|
||||
$(ANSIBLE_VAULT) create $(INVENTORY)/group_vars/all/vault.yml
|
||||
@echo "$(GREEN)✓ Vault file created. Add your secrets here (e.g. vault_tailscale_auth_key)$(RESET)"
|
||||
|
||||
create-vm: ## Create Ansible controller VM on Proxmox
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
inventory = inventories/production
|
||||
roles_path = roles
|
||||
host_key_checking = False
|
||||
stdout_callback = yaml
|
||||
stdout_callback = default
|
||||
callback_result_format = yaml
|
||||
bin_ansible_callbacks = True
|
||||
retry_files_enabled = False
|
||||
gathering = smart
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
HOSTS_FILE="inventories/production/hosts"
|
||||
TIMEOUT=3
|
||||
CHANGED=false
|
||||
UNAME_S="$(uname -s)"
|
||||
|
||||
# Colors
|
||||
GREEN='\033[0;32m'
|
||||
@ -18,10 +19,12 @@ echo "=================================================================="
|
||||
# Function to test IP connectivity
|
||||
test_ip() {
|
||||
local ip="$1"
|
||||
if ping -c 1 -W "$TIMEOUT" "$ip" >/dev/null 2>&1; then
|
||||
return 0
|
||||
if [[ "$UNAME_S" == "Darwin" ]]; then
|
||||
# macOS: -W is wait time in milliseconds
|
||||
ping -c 1 -W $((TIMEOUT * 1000)) "$ip" >/dev/null 2>&1
|
||||
else
|
||||
return 1
|
||||
# Linux: -W is timeout in seconds
|
||||
ping -c 1 -W "$TIMEOUT" "$ip" >/dev/null 2>&1
|
||||
fi
|
||||
}
|
||||
|
||||
@ -31,7 +34,7 @@ test_ssh() {
|
||||
local ip="$2"
|
||||
local user="$3"
|
||||
|
||||
if timeout 5 ssh -o ConnectTimeout=3 -o BatchMode=yes "$user@$ip" exit >/dev/null 2>&1; then
|
||||
if ssh -o ConnectTimeout=3 -o BatchMode=yes "$user@$ip" exit >/dev/null 2>&1; then
|
||||
return 0
|
||||
else
|
||||
return 1
|
||||
@ -46,11 +49,14 @@ switch_to_fallback() {
|
||||
|
||||
echo -e " ${YELLOW}→ Switching $hostname to fallback IP: $fallback_ip${NC}"
|
||||
|
||||
# Use sed to replace the primary IP with fallback IP
|
||||
sed -i "s/$hostname ansible_host=$primary_ip/$hostname ansible_host=$fallback_ip/" "$HOSTS_FILE"
|
||||
|
||||
# Remove the fallback attribute since we're now using it as primary
|
||||
sed -i "s/ ansible_host_fallback=$fallback_ip//" "$HOSTS_FILE"
|
||||
# Use sed to replace the primary IP with fallback IP (BSD/GNU compatible)
|
||||
if [[ "$UNAME_S" == "Darwin" ]]; then
|
||||
sed -i '' "s/$hostname ansible_host=$primary_ip/$hostname ansible_host=$fallback_ip/" "$HOSTS_FILE"
|
||||
sed -i '' "s/ ansible_host_fallback=$fallback_ip//" "$HOSTS_FILE"
|
||||
else
|
||||
sed -i "s/$hostname ansible_host=$primary_ip/$hostname ansible_host=$fallback_ip/" "$HOSTS_FILE"
|
||||
sed -i "s/ ansible_host_fallback=$fallback_ip//" "$HOSTS_FILE"
|
||||
fi
|
||||
|
||||
CHANGED=true
|
||||
}
|
||||
@ -66,9 +72,10 @@ while IFS= read -r line; do
|
||||
# Parse host entry
|
||||
if [[ "$line" =~ ansible_host= ]]; then
|
||||
hostname=$(echo "$line" | awk '{print $1}')
|
||||
primary_ip=$(echo "$line" | grep -oP 'ansible_host=\K[^\s]+')
|
||||
fallback_ip=$(echo "$line" | grep -oP 'ansible_host_fallback=\K[^\s]+' || echo "")
|
||||
user=$(echo "$line" | grep -oP 'ansible_user=\K[^\s]+' || echo "root")
|
||||
primary_ip=$(echo "$line" | sed -n 's/.*ansible_host=\([^[:space:]]*\).*/\1/p')
|
||||
fallback_ip=$(echo "$line" | sed -n 's/.*ansible_host_fallback=\([^[:space:]]*\).*/\1/p')
|
||||
user=$(echo "$line" | sed -n 's/.*ansible_user=\([^[:space:]]*\).*/\1/p')
|
||||
[[ -z "$user" ]] && user="root"
|
||||
|
||||
echo -n "Testing $hostname ($primary_ip)... "
|
||||
|
||||
|
||||
60
docs/guides/ansible-vault-secrets.md
Normal file
60
docs/guides/ansible-vault-secrets.md
Normal file
@ -0,0 +1,60 @@
|
||||
# Encrypted secrets in this project
|
||||
|
||||
Ansible Vault is the standard way to store and share secrets with this repo. Plain `.env` files are gitignored and meant only as a **temporary** import path on your machine.
|
||||
|
||||
## Recommended workflow
|
||||
|
||||
1. **Never commit** `.env`, API keys, or passwords.
|
||||
2. Store secrets in `inventories/production/group_vars/all/vault.yml` (encrypted).
|
||||
3. Edit with `make edit-group-vault` (uses `~/.ansible-vault-pass` on your workstation).
|
||||
4. Teammates must receive the same vault password file out-of-band (for example via a password manager, never via git).
|
||||
|
||||
## One-time import from `.env`
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# fill MAILCOW_API_KEY, ALERTS_PASSWORD, etc.
|
||||
make vault-import-env
|
||||
rm .env # optional after import
|
||||
```
|
||||
|
||||
`make vault-import-env` merges supported keys into the vault and re-encrypts the file.
|
||||
|
||||
## Mailcow mailboxes (dynamic)
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `group_vars/all/mailcow.yml` | Mailbox names, local parts, quotas (no secrets) |
|
||||
| `vault.yml` | `vault_mailcow_api_key`, `vault_mailcow_mailbox_passwords` |
|
||||
|
||||
```bash
|
||||
make mailcow-mailbox MAILBOX=alerts
|
||||
```
|
||||
|
||||
Add a new mailbox:
|
||||
|
||||
1. In `mailcow.yml` under `mailcow_mailboxes:` add e.g. `notify: { local_part: notify, name: Notify, quota: 512, vault_password_key: notify }`
|
||||
2. In vault: `vault_mailcow_mailbox_passwords.notify: "..."` (via `make edit-group-vault`)
|
||||
3. `make mailcow-mailbox MAILBOX=notify`
|
||||
|
||||
## Can `.env` itself be encrypted?
|
||||
|
||||
Yes, but Ansible projects usually skip that pattern:
|
||||
|
||||
| Approach | Use when |
|
||||
|----------|----------|
|
||||
| **Ansible Vault** (`vault.yml`) | Default for this repo — works with playbooks and `make` targets |
|
||||
| **`ansible-vault encrypt .env`** | Turns `.env` into an encrypted vault blob; you must run `ansible-vault view .env` or decrypt it to a temp file before any tool can read it — awkward for shell scripts |
|
||||
| **Password manager / 1Password CLI** | Personal machine only, not for CI/ansible runs |
|
||||
| **SOPS / Mozilla SOPS** | Teams that want encrypted YAML/JSON in git with KMS/PGP — heavier setup |
|
||||
|
||||
**Sharing encrypted secrets with others:** share the **vault password** (or per-host vault pass) securely once; they clone the repo and use the same encrypted `vault.yml`. Do not email `.env` files.
|
||||
|
||||
## Encrypting a single value (without opening the whole file)
|
||||
|
||||
```bash
|
||||
ansible-vault encrypt_string 'secret-value' --name 'vault_my_secret' \
|
||||
--vault-password-file ~/.ansible-vault-pass
|
||||
```
|
||||
|
||||
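Paste the output (a `!vault |` block) into a vars file. Because the value is already encrypted inline, it can live in an otherwise plain-text vars file; if you add it to `vault.yml` instead, do so through `make edit-group-vault`, since that file is itself entirely vault-encrypted.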
|
||||
62
docs/guides/homelab-status-2026-05-22.md
Normal file
62
docs/guides/homelab-status-2026-05-22.md
Normal file
@ -0,0 +1,62 @@
|
||||
# Homelab status — 2026-05-22
|
||||
|
||||
Quick checklist after monitoring / sites / git pass.
|
||||
|
||||
## Done (automation)
|
||||
|
||||
| Item | Notes |
|
||||
|------|--------|
|
||||
| Mailcow `alerts@levkine.ca` | Created via API |
|
||||
| Kuma + Dockge + Umami | LXC 218 @ `10.0.10.22`; Dockge stack **monitoring** active |
|
||||
| Old Kuma pve201 LXC 305 | Stopped, `onboot` off |
|
||||
| `stats.levkin.ca` | Caddy → Umami `:3000` |
|
||||
| Tracking scripts | caseware + auto + portfolio (`iliadobkin.com`) |
|
||||
| Portfolio `iliadobkin.com` | Migrated pve201 LXC **306** → pve10 LXC **219** @ `10.0.10.106`; Caddy → nginx `:80` |
|
||||
| Kuma SMTP | Working (user confirmed) |
|
||||
| Git remote | `git@git.levkin.ca:ilia/...` (SSH → `10.0.10.169` via `~/.ssh/config` on site LXCs) |
|
||||
| auto repo | Pushed/pulled on `git.levkin.ca` |
|
||||
| caseware repo | Pushed to Gitea via bundle on server; LXCs pull via internal SSH |
|
||||
| Vault | Mailcow, Umami, Mattermost in vault; `make vault-export-env` → `.env`; `make vault-pull-infra-secrets` = hosts → vault |
|
||||
| Caddy root SSH | Works (`make bootstrap-root-ssh-caddy`) |
|
||||
| Hermes Mattermost | `mattermost.env` on VM; Telegram optional/off |
|
||||
|
||||
## Your list — still to do
|
||||
|
||||
### You (UI / hardware / DNS)
|
||||
|
||||
- [x] **Kuma SMTP** — working
|
||||
- [ ] **UniFi DHCP reservations** — [unifi-static-dhcp.md](unifi-static-dhcp.md) @ https://192.168.2.1/
|
||||
- [ ] **Cal.com → Authentik OIDC** — first SSO (~1–2 h) — [levkin-selfhost-plan-2.md](levkin-selfhost-plan-2.md)
|
||||
- [ ] **Nextcloud VM 201 retire** — remove Kuma monitor, Caddy `nextcloud.levkin.ca`, stop VM
|
||||
- [ ] **NAS.SP00 disk replace** — then start Jellyfin (VM 101)
|
||||
- [x] **Gitea deploy key (portfolio)** — `git pull` works on LXC 219; Gitea VM SSH fixed (`/home/git/.ssh/authorized_keys` + `sudo` to `gitea`)
|
||||
- [ ] **`.env`** — optional mirror: `make vault-export-env` (vault already has secrets)
|
||||
- [ ] **Rotate** any secrets pasted in chat (Hermes token, etc.)
|
||||
|
||||
### Later / defer
|
||||
|
||||
- [ ] Caddy → edge LXC `.20`
|
||||
- [ ] Immich, Crater, Beszel
|
||||
- [ ] Public SSH for `git.levkin.ca:22` (optional Caddy `layer4` or DNS split)
|
||||
|
||||
## Site LXCs (marketing)
|
||||
|
||||
| VMID | Name | IP | Git remote |
|
||||
|------|------|-----|------------|
|
||||
| 215 | caseware | 10.0.10.105 | `git@git.levkin.ca:ilia/caseware.git` |
|
||||
| 216 | auto | 10.0.10.59 | `git@git.levkin.ca:ilia/auto.git` |
|
||||
| 219 | portfolio | 10.0.10.106 | `git@git.levkin.ca:ilia/sdetProfile.git` |
|
||||
|
||||
**Git SSH note:** keep `git.levkin.ca` in the remote URL; the SSH config below sends the traffic to **10.0.10.169:22** (not `10.0.30.169`, and not the public `:22`).
|
||||
|
||||
```ssh
|
||||
# On each site LXC /root/.ssh/config
|
||||
Host git.levkin.ca
|
||||
HostName 10.0.10.169
|
||||
User git
|
||||
IdentityFile ~/.ssh/id_ed25519
|
||||
```
|
||||
|
||||
## Dockge
|
||||
|
||||
The stack named **monitoring** in the Dockge UI is the correct one. Its compose file is at `/opt/stacks/monitoring/compose.yaml`; the live stack is also present at `/opt/monitoring` (same containers). Use Dockge for edits and restarts, and avoid starting a second copy.
|
||||
140
docs/guides/host-list.md
Normal file
140
docs/guides/host-list.md
Normal file
@ -0,0 +1,140 @@
|
||||
# Host list — Proxmox guests (source of truth)
|
||||
|
||||
**Node:** PVENAS (`pve10` @ `10.0.10.10`)
|
||||
**Audited:** 2026-05-22 (Phase 0 IP pass + monitoring LXC 218 provisioned)
|
||||
**LAN:** `10.0.10.0/24`, gateway `10.0.10.1`
|
||||
|
||||
Update this file whenever a guest is created, migrated, or re-IP’d. See [levkin-selfhost-plan-2.md](levkin-selfhost-plan-2.md) for IP range policy.
|
||||
|
||||
---
|
||||
|
||||
## IP range plan (10.0.10.0/24)
|
||||
|
||||
| Range | Reserved for |
|
||||
|-------|----------------|
|
||||
| `.1–.9` | Network gear |
|
||||
| `.10–.19` | Proxmox host(s) + PBS |
|
||||
| `.20–.39` | Edge / identity / comms |
|
||||
| `.40–.79` | Application LXCs / VMs |
|
||||
| `.80–.99` | Media VMs |
|
||||
| `.100–.199` | DHCP pool (clients) |
|
||||
| `.200–.249` | Labs / heavy VMs |
|
||||
| `.250–.254` | Reserved |
|
||||
|
||||
**Rollout reservations (free):** `.20` edge LXC
|
||||
|
||||
---
|
||||
|
||||
## Proxmox host
|
||||
|
||||
| VMID | Name | Role | Current IP | Target static IP | DHCP/Static | Notes |
|
||||
|------|------|------|------------|------------------|-------------|-------|
|
||||
| — | **pve10** | Proxmox (PVENAS) | `10.0.10.10/24` | `.10` | Static | This node |
|
||||
|
||||
---
|
||||
|
||||
## LXCs (pve10)
|
||||
|
||||
| VMID | Name | Plan group | Current IP | Target static IP | DHCP/Static | MAC | Notes |
|
||||
|------|------|------------|------------|------------------|-------------|-----|-------|
|
||||
| 210 | cal | business | `10.0.10.228/24` | `10.0.10.228/24` | ✅ **Static** | `BC:24:11:DD:F8:7C` | Cal.com — `pct set` applied; in Ansible `hosts` |
|
||||
| 215 | caseware | **marketing site** | `10.0.10.105/24` | `10.0.10.105/24` | ✅ **Static** | `BC:24:11:72:04:53` | Static HTML `/var/www/caseware` → `caseware.levkin.ca` |
|
||||
| 216 | auto | **marketing site** | `10.0.10.59/24` | `10.0.10.59/24` | ✅ **Static** | `BC:24:11:43:F0:86` | Static HTML `/var/www/auto` → `auto.levkin.ca` |
|
||||
| 219 | portfolio | **marketing site** | `10.0.10.106/24` | `10.0.10.106/24` | ✅ **Static** | `BC:24:11:DF:94:32` | Static HTML `/var/www/portfolio` → `iliadobkin.com` (migrated from pve201 LXC 306) |
|
||||
| 217 | identity | identity | `10.0.10.21/24` | `10.0.10.21/24` | ✅ **Static** | `BC:24:11:3C:85:45` | Authentik + Postgres + Redis; `auth.levkin.ca` via Caddy |
|
||||
| 218 | monitoring | monitoring | `10.0.10.22/24` | `10.0.10.22/24` | ✅ **Static** | `BC:24:11:54:43:13` | Uptime Kuma `:3001`, Dockge `:5001`, Umami `:3000` — see [monitoring-stack.md](monitoring-stack.md) |
|
||||
|
||||
**pve201 (not pve10):** LXC **305** `kuma-debian` @ `10.0.10.197` — **stopped 2026-05-22** (replaced by monitoring LXC 218). `onboot` disabled. LXC **306** `portfolio` — **destroyed/purged 2026-05-22** (now pve10 LXC **219** @ `10.0.10.106`).
|
||||
|
||||
---
|
||||
|
||||
## VMs (pve10)
|
||||
|
||||
| VMID | Name | Plan group | Current IP | Target static IP | DHCP/Static | MAC | Notes |
|
||||
|------|------|------------|------------|------------------|-------------|-----|-------|
|
||||
| 100 | homepage-debian | — | — | — | — | — | **Stopped** |
|
||||
| 101 | Jellyfin | media | `10.0.10.232` | `10.0.10.232/24` | ⏳ DHCP? | `BC:24:11:29:B8:84` | **Stopped** (turned off 2026-05-22); inventory `jellyfin` |
|
||||
| 102 | gitea-alpine | — | `10.0.10.169/24` | `10.0.10.169/24` | ⏳ stable DHCP | `BC:24:11:E9:BD:E5` | Pin in-guest or router reservation |
|
||||
| 103 | WRA | — | `10.0.10.154/24` | `10.0.10.154/24` | ⏳ stable DHCP | `BC:24:11:61:DE:7A` | Inventory `n8n`; pin when automating |
|
||||
| 104 | vaultwarden-debian | identity | `10.0.10.142/24` | `10.0.10.142/24` | ⏳ stable DHCP | `BC:24:11:58:DB:DC` | Inventory `vaultwardenVM` |
|
||||
| 105 | TrueNAS | — | `10.0.10.107/24` | `10.0.10.107/24` | ⏳ stable DHCP | `BC:24:11:14:DE:B5` | NAS UI; pool `NAS.SP00` degraded |
|
||||
| 106 | caddy-debian | **edge** | `10.0.10.50/24` | `10.0.10.50/24` → **`.20`** (Phase 1.5) | ✅ **Static** (in-guest) | `BC:24:11:E0:49:B4` | `/etc/network/interfaces` static; Ansible `caddy` |
|
||||
| 107 | mattermost-ubuntu | comms | `10.0.10.107`? | TBD | ⏳ | `BC:24:11:66:6E:01` | Ping `.107` up; confirm not TrueNAS conflict — verify in guest |
|
||||
| 108 | actual-debian | business | `10.0.10.158/24` | `10.0.10.158/24` | ⏳ stable DHCP | `BC:24:11:10:7B:64` | Inventory `actual` |
|
||||
| 109 | portainer-alpine | — | unknown | — | ⏳ | `BC:24:11:0F:40:4F` | Running; retire → Dockge on monitoring LXC |
|
||||
| 150 | pihole00-debian | — | link-local* | TBD | ⏳ | `BC:24:11:86:76:97` | Running |
|
||||
| 117 | hermes | services | `10.0.10.36/24` | `10.0.10.36/24` | ⏳ stable DHCP | `BC:24:11:51:1E:99` | On pve10; guest agent; inventory `hermes` |
|
||||
| 200 | PVE.BU.SVR | labs | `10.0.10.200/24` | `10.0.10.200/24` | ⏳ stable DHCP | `BC:24:11:DA:95:3B` | Running |
|
||||
| 201 | NextcloudAIO-debian | (decommission) | `10.0.10.24/24` | — | 🗑️ **Retiring** | `BC:24:11:14:D4:DE` | Export done; remove Caddy + Kuma monitor, then stop VM |
|
||||
| 300 | pihole-debian | — | — | — | — | — | **Stopped** |
|
||||
|
||||
\* ARP showed IPv6 link-local only at audit time — confirm IPv4 inside guest or install QEMU guest agent.
|
||||
|
||||
---
|
||||
|
||||
## Inventory cross-reference (Ansible `hosts`)
|
||||
|
||||
| Inventory name | IP in hosts | pve10 guest | Match |
|
||||
|----------------|-------------|-------------|-------|
|
||||
| caddy | `10.0.10.50` | VM 106 | ✅ |
|
||||
| cal | `10.0.10.228` | LXC 210 | ✅ |
|
||||
| caseware | `10.0.10.105` | LXC 215 | ✅ |
|
||||
| auto | `10.0.10.59` | LXC 216 | ✅ |
|
||||
| portfolio | `10.0.10.106` | LXC 219 | ✅ |
|
||||
| identity | `10.0.10.21` | LXC 217 | ✅ |
|
||||
| monitoring | `10.0.10.22` | LXC 218 | ✅ |
|
||||
| vaultwardenVM | `10.0.10.142` | VM 104 | ✅ |
|
||||
| giteaVM | `10.0.10.169` | VM 102 | ✅ |
|
||||
| n8n | `10.0.10.154` | VM 103? | ⚠️ verify (WRA vs n8n) |
|
||||
| listmonk | `10.0.10.148` | — | On **pve201** (`[comms]`) |
|
||||
| mailcow | `10.0.10.132` | pve201 VM 106 | ✅ `[comms]` |
|
||||
| hermes | `10.0.10.36` | VM 117 | ✅ on pve10 |
|
||||
| jellyfin | `10.0.10.232` | VM 101 | ✅ (stopped until NAS healthy) |
|
||||
| nextcloud | `10.0.10.24` | VM 201 | commented out (retiring) |
|
||||
| portainerVM | — | VM 109 | removed (Dockge on monitoring) |
|
||||
|
||||
---
|
||||
|
||||
## Static IP conversion queue (pve10)
|
||||
|
||||
Priority order (plan-2):
|
||||
|
||||
1. ✅ **LXC 210** — done (`10.0.10.228/24`)
|
||||
2. ✅ **LXC 215, 216** — pinned (`.105`, `.59`)
|
||||
3. ✅ **LXC 217** (identity) — `10.0.10.21/24`, Authentik deployed
|
||||
4. ✅ **VM 106** (caddy) — static in-guest `.50`
|
||||
5. ✅ **LXC 218** (monitoring) — `.22`, Kuma/Dockge/Umami
|
||||
6. **VMs** — use [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md) (router MAC reservations); skip **201** (Nextcloud retire)
|
||||
7. **New:** edge LXC @ **`.20`** (Phase 1.5)
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
# On pve10 (PVENAS)
|
||||
pct set 215 -net0 name=eth0,bridge=vmbr0,ip=10.0.10.105/24,gw=10.0.10.1
|
||||
pct set 216 -net0 name=eth0,bridge=vmbr0,ip=10.0.10.59/24,gw=10.0.10.1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## NAS / storage note
|
||||
|
||||
- ZFS pool **`NAS.SP00`** on this node: **DEGRADED** (disk `W4J0L3PY` failed). See [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md), [nas-sp00-smart-audit-2026-05-21.md](nas-sp00-smart-audit-2026-05-21.md).
|
||||
- VM **201** root disk on NAS — avoid heavy I/O until pool is healthy.
|
||||
|
||||
---
|
||||
|
||||
## Audit checklist
|
||||
|
||||
- [x] `pct list` / `qm list` on pve10
|
||||
- [x] ARP / ping for running guests
|
||||
- [ ] `pct exec` (LXCs) / `qm guest exec` via guest agent (VMs) for guests missing IPv4
|
||||
- [x] Initial `host-list.md` created
|
||||
- [x] Pin 215/216 static
|
||||
- [x] Identity LXC 217 @ `.21` (Authentik Phase 1 infra)
|
||||
- [x] Monitoring LXC 218 @ `.22`
|
||||
- [x] Caddy VM 106 static `.50`
|
||||
- [x] LXC backups `backup-20260522` on 217, 218
|
||||
- [ ] Router DHCP reservations for VMs — [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md) (manual in router UI; table ready)
|
||||
- [ ] Retire VM 201 (Nextcloud)
|
||||
- [ ] Re-run after NAS disk replace
|
||||
346
docs/guides/levkin-selfhost-plan-2.md
Normal file
346
docs/guides/levkin-selfhost-plan-2.md
Normal file
@ -0,0 +1,346 @@
|
||||
# Levkin self-hosted stack — plan & decisions
|
||||
|
||||
Reference doc for the Proxmox homelab. Lives alongside the Cursor project that has the Proxmox info.
|
||||
|
||||
**Conventions:**
|
||||
- All groups run inside an LXC unless marked **VM**.
|
||||
- Inside each LXC: one `docker-compose.yml`, managed by **Dockge** where applicable.
|
||||
- Caddy on the `edge` LXC is the only thing exposed to the internet.
|
||||
- Authentik on the `identity` LXC is the source of truth for who you are.
|
||||
- Vaultwarden stays standalone (it's the break-glass path if Authentik dies).
|
||||
|
||||
---
|
||||
|
||||
## Current state (May 2026)
|
||||
|
||||
**Already running:**
|
||||
- Caddy reverse proxy — currently on a **VM** (should migrate to LXC, see "Caddy migration" section)
|
||||
- Mailcow — VM, mail domain is `levkine.ca` (with e)
|
||||
- Vaultwarden, Vikunja, n8n, Listmonk, Mattermost, Nextcloud — across various LXCs
|
||||
- **Cal.com** — LXC id `210`, `cal.levkin.ca`, Postgres included, admin user `ilia`, 15-min consult event live at `cal.levkin.ca/ilia/consult` with Jitsi link
|
||||
- Caddy entries live for: `caseware.levkin.ca`, `auto.levkin.ca`, `iliadobkin.com`, `cal.levkin.ca`, `listmonk.levkin.ca`, `pdf.levkin.ca`, `search.levkin.ca`, `auth.levkin.ca`
|
||||
- **Authentik** — LXC **217** @ `10.0.10.21`, `https://auth.levkin.ca`, admin + TOTP enrolled
|
||||
- **Monitoring** — LXC **218** @ `10.0.10.22`: Uptime Kuma `:3001`, Dockge `:5001`, Umami `:3000` (LAN-only) — [monitoring-stack.md](monitoring-stack.md)
|
||||
- **Umami** + **Authentik** admin/TOTP/backup codes — done
|
||||
- **Uptime Kuma** — monitors live; email alerts via Mailcow — see [monitoring-stack.md](monitoring-stack.md)
|
||||
- **Dockge** on 218 — manages local `/opt/monitoring` stack
|
||||
- **Snapshots** `backup-20260522` on LXCs **217**, **218**
|
||||
- **Jellyfin** (VM 101) — stopped
|
||||
- LXC **210, 215–218, 219** — static via `pct set`; **Caddy VM 106** — static in-guest `.50`
|
||||
- **Nextcloud VM 201** — export done; **retire soon** (no SSO, remove Kuma monitor + Caddy block when off)
|
||||
|
||||
**Decisions locked in:**
|
||||
- Container manager: **Dockge** (not Portainer, not Coolify/Dokploy/CapRover)
|
||||
- Chat: **Mattermost only** — no Matrix/Synapse
|
||||
- Knowledge tool: **Outline** for client-facing, **SiYuan** if/when PhD work picks up (don't run Affine + Trilium too)
|
||||
- Bookmark manager: **Linkwarden** (full-page archive is the killer feature)
|
||||
- Authentik is the SSO target; Vaultwarden stays standalone
|
||||
|
||||
---
|
||||
|
||||
## LXC / VM grouping table
|
||||
|
||||
| Group | What's inside | Why grouped | LXC or VM |
|
||||
|---|---|---|---|
|
||||
| **edge** | Caddy reverse proxy, Crowdsec/Fail2ban | The front door — small, stable, restart rarely | LXC, 1 vCPU, 1GB RAM |
|
||||
| **identity** | Authentik (+ Postgres + Redis), Vaultwarden | Auth-critical — touch rarely, back up religiously | LXC, 2 vCPU, 2GB RAM |
|
||||
| **comms** | Mailcow | Mailcow's compose is huge (15+ containers) and self-contained — wants its own host | **VM**, 4GB RAM |
|
||||
| **automation** | n8n, Windmill (later), Huginn (later) | Active workloads, frequent updates, you'll touch these a lot | LXC, 2–4 vCPU, 4GB RAM |
|
||||
| **productivity** | Vikunja, Listmonk, Outline, Mealie, Linkwarden | Personal/team productivity, low-resource | LXC, 2 vCPU, 4GB RAM |
|
||||
| **media** | Immich, Nextcloud, Paperless-ngx | Large storage, GPU passthrough useful for Immich ML | **VM** if GPU passthrough, else LXC. Lots of disk. |
|
||||
| **business** | Cal.com ✅, Crater | Client-facing, financial — back up often | LXC, 2 vCPU, 2GB RAM |
|
||||
| **monitoring** | Uptime Kuma ✅, Dockge ✅, Umami ✅, Beszel (later) | Ops stack on LXC **218** | LXC, 2 vCPU, 2GB RAM |
|
||||
| **labs** | Anything experimental — Flowise, Trigger.dev | Things you're trying out, can be wiped | LXC, scratch space |
|
||||
|
||||
### Why this grouping (cheat sheet)
|
||||
|
||||
- One service goes bad → only its group restarts.
|
||||
- Need a kernel upgrade for one stack → snapshot the LXC, upgrade, roll back if broken.
|
||||
- Mailcow's huge surface area is isolated in its own VM.
|
||||
- Edge LXC is tiny and stable → perfect for the layer everything depends on.
|
||||
- Backup cadence per group (see Backups section).
|
||||
- Resource limits per LXC mean a runaway container can't eat n8n's RAM.
|
||||
|
||||
---
|
||||
|
||||
## Subdomains
|
||||
|
||||
Only expose what actually needs to be public. Internal services use Tailscale/Wireguard for remote access.
|
||||
|
||||
### Expose publicly
|
||||
|
||||
| Subdomain | Service | Group | Why public | Status |
|
||||
|---|---|---|---|---|
|
||||
| `caseware.levkin.ca` | Static site | edge | Marketing | ✅ live |
|
||||
| `auto.levkin.ca` | Static site | edge | Marketing | ✅ live |
|
||||
| `iliadobkin.com` | Portfolio (SDET) | edge | Personal site | ✅ live (pve10 LXC 219) |
|
||||
| `cal.levkin.ca` | Cal.com | business | Clients book on it | ✅ live |
|
||||
| `listmonk.levkin.ca` | Listmonk | productivity | Unsubscribe URLs must resolve | ✅ live |
|
||||
| `mail.levkine.ca` | Mailcow | comms | Mail server | ✅ live |
|
||||
| `auth.levkin.ca` | Authentik | identity | OIDC redirect URLs need external resolution | ✅ live |
|
||||
| `bill.levkin.ca` | Crater | business | Clients view invoices | ⏳ Phase 6 |
|
||||
| `cloud.levkin.ca` | Nextcloud | media | **Retiring** — decommission VM 201 after cutover | 🗑️ |
|
||||
| `photos.levkin.ca` | Immich | media | Mobile apps need public hostname | ⏳ Phase 5 |
|
||||
| `vault.levkin.ca` | Vaultwarden | identity | Mobile clients need public hostname | ⏳ |
|
||||
| `notes.levkin.ca` | Outline | productivity | Sharing docs with clients | ⏳ |
|
||||
| `chat.levkin.ca` | Mattermost | comms | Only if inviting outside users | ⏳ optional |
|
||||
|
||||
### Keep internal only (no public DNS, no Caddy block)
|
||||
|
||||
Reachable only via local network or Tailscale/Wireguard:
|
||||
|
||||
| Service | Reason |
|
||||
|---|---|
|
||||
| Umami admin UI | Only you need the dashboard. Tracking endpoint can be public, dashboard isn't. |
|
||||
| Uptime Kuma | Status dashboard is for you. Don't advertise infrastructure. |
|
||||
| Beszel | Metrics are admin-only. |
|
||||
| Dockge | Admin UI — local only. |
|
||||
| n8n editor | UI shouldn't be exposed. Webhooks go on `hooks.levkin.ca` if needed. |
|
||||
| Huginn / Windmill / Flowise | Admin tools. |
|
||||
| Vikunja | Personal task manager. |
|
||||
| Mealie | Family recipes. |
|
||||
| Trigger.dev | Internal automation. |
|
||||
| Paperless-ngx | Personal documents. Never expose. |
|
||||
| SiYuan | Personal knowledge. |
|
||||
| Linkwarden | Personal bookmarks. |
|
||||
|
||||
### Borderline (decide per service)
|
||||
|
||||
| Subdomain | Service | Notes |
|
||||
|---|---|---|
|
||||
| `stats.levkin.ca` | Umami collector | Only the tracking script endpoint needs to be public; admin UI stays internal |
|
||||
| `status.levkin.ca` | Uptime Kuma | Kuma supports a separate public status page URL — that one can be public |
|
||||
|
||||
---
|
||||
|
||||
## Phased rollout
|
||||
|
||||
### Phase 0 — Foundation
|
||||
1. ✅ Caddy running (on VM — migrate to LXC in Phase 1.5)
|
||||
2. ✅ **Static IP audit (partial)** — all LXCs on pve10 pinned; Caddy VM static `.50`; remaining VMs on stable DHCP — see [host-list.md](host-list.md)
|
||||
3. ✅ DNS for `auth.levkin.ca` → home IP (verified 2026-05-22)
|
||||
4. ✅ `identity` LXC **217** @ `10.0.10.21` (2 vCPU, 2GB RAM, 20GB `local-lvm`, Debian 12 + Docker Compose)
|
||||
|
||||
### Phase 1 — Identity ✅
|
||||
1. ✅ Deploy Authentik in `identity` LXC (Authentik + Postgres + Redis, official compose at `/opt/authentik`)
|
||||
2. ✅ Caddy: `auth.levkin.ca` → `10.0.10.21:9000` (simple passthrough, no forward-auth)
|
||||
3. ✅ Admin user (`admin`), TOTP enrolled
|
||||
4. ✅ `authentik Admins` group (skip custom `users` group until more accounts)
|
||||
5. ✅ Static backup codes; **don't OIDC other apps until Cal.com test**
|
||||
|
||||
### Phase 1.5 — Caddy migration to LXC (~30 min)
|
||||
|
||||
Why now (after Phase 1, before bulk SSO work in Phase 4): Authentik is stable enough to absorb a small change, but you haven't yet built the dependency web of OIDC integrations that would make a Caddy reload risky.
|
||||
|
||||
Why Caddy belongs in an LXC, not a VM:
|
||||
- ~50MB OS overhead vs ~512MB for a VM
|
||||
- Boot/restart in 2-5s vs 20-40s (matters when reloading config)
|
||||
- Snapshot/backup is faster
|
||||
- Caddy is a Go binary doing reverse-proxy work — no need for kernel isolation
|
||||
- Near-native network performance
|
||||
|
||||
Steps:
|
||||
1. Create `edge` LXC: Debian 12, 1 vCPU, 512MB RAM, 8GB disk, **static IP from host list**
|
||||
2. Install Caddy via official Debian repo:
|
||||
```bash
|
||||
apt install -y debian-keyring debian-archive-keyring apt-transport-https
|
||||
curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg
|
||||
curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | tee /etc/apt/sources.list.d/caddy-stable.list
|
||||
apt update && apt install caddy
|
||||
```
|
||||
3. Copy `Caddyfile` + custom snippets (`(security-headers)` etc.) from the VM
|
||||
4. Add a **test subdomain** (e.g. `test.levkin.ca`) pointing at the new LXC — verify TLS issues and routing works
|
||||
5. Cut over: update router port-forward (80/443) to the new LXC IP. DNS A records don't need to change if they point to your home IP.
|
||||
6. Watch Mailcow, Cal.com, Listmonk, the marketing sites for ~24h
|
||||
7. Keep the old VM snapshot for a week, then delete
|
||||
|
||||
### Phase 2 — Quick wins ✅
|
||||
1. ✅ **Umami** — tracking on caseware, auto, and iliadobkin.com (portfolio)
|
||||
2. ✅ **Uptime Kuma** — monitors in UI
|
||||
3. ✅ **Dockge** — logged in; register `/opt/monitoring` stack (see [monitoring-stack.md](monitoring-stack.md))
|
||||
4. ✅ **Kuma email alerts** — SMTP via Mailcow `alerts@levkine.ca` → your inbox (steps in [monitoring-stack.md](monitoring-stack.md))
|
||||
|
||||
### Phase 3 — Cal.com (mostly done) ✅
|
||||
1. ✅ Cal.com deployed in `business` LXC (id 210, Postgres included)
|
||||
2. ✅ `cal.levkin.ca` proxied via Caddy
|
||||
3. ✅ Booking link live at `cal.levkin.ca/ilia/consult` with Jitsi location
|
||||
4. ✅ Email working via `cal@levkine.ca` SMTP through Mailcow
|
||||
5. ⏳ **Wire Cal.com to Authentik via OIDC** (first real SSO connection — do this after Phase 1)
|
||||
6. ⏳ Update `auto.levkin.ca` button → `cal.levkin.ca/ilia/consult` (currently points to placeholder)
|
||||
|
||||
### Phase 4 — SSO migration (~half a day, staged)
|
||||
Wire each to Authentik, least-risky first:
|
||||
1. **Vikunja** (OIDC native) — easy, single-user impact
|
||||
2. ~~**Nextcloud**~~ — **skipped** (VM 201 retiring)
|
||||
3. **Listmonk** (OIDC native, admin only) — easy
|
||||
4. **Mattermost** (SAML or OIDC native) — moderate
|
||||
5. **Mailcow** (OIDC) — last, because mail-critical
|
||||
|
||||
For each: keep a local admin password as a break-glass account.
|
||||
|
||||
### Phase 5 — Family / personal wins (~1 evening)
|
||||
1. **Immich** in `media` VM — install mobile apps for you and family, enable auto-upload. Face recognition runs in background; "my kids 2024" works within a couple days.
|
||||
2. Skip PhotoPrism — Immich covers it.
|
||||
|
||||
### Phase 6 — Business / consulting (~1–2 evenings)
|
||||
1. **Crater** in `business` LXC — tax rates, company info, Stripe integration if you want online payment
|
||||
2. **Beszel** hub in `monitoring` LXC + agents on each LXC — one dashboard for resource usage
|
||||
|
||||
### Phase 7 — Automation depth (ongoing)
|
||||
Only when you have a real use case:
|
||||
1. **Huginn** in `automation` — first agent: competitor pages, kosher product availability, grant deadlines
|
||||
2. **Windmill** in `automation` — first script: rewrite an n8n flow with too many code nodes
|
||||
3. **Flowise** in `labs` — first flow: chat-with-docs against your consulting notes
|
||||
|
||||
### Phase 8 — Knowledge / research
|
||||
1. **Outline** in `productivity` LXC — client-facing wiki + your notes
|
||||
2. **Linkwarden** in `productivity` LXC — bookmarks with full-page archive
|
||||
3. **Paperless-ngx** in `media` — scan and OCR the paper that's accumulating
|
||||
4. **SiYuan** — only if/when PhD or long-form research becomes relevant
|
||||
|
||||
---
|
||||
|
||||
## Static IP audit
|
||||
|
||||
**Maintain a `host-list.md` file** (in this Cursor project, alongside this plan) with every LXC/VM, its current IP, its target static IP, and DHCP/static status. Cursor will use this as the source of truth when scripting changes.
|
||||
|
||||
Suggested format:
|
||||
|
||||
| LXC/VM ID | Name | Role | Current IP | Target static IP | DHCP/Static | Notes |
|
||||
|---|---|---|---|---|---|---|
|
||||
| 210 | cal | Cal.com | 10.0.10.228/24 (DHCP) | 10.0.10.228/24 | ⏳ static | Convert ASAP |
|
||||
| ... | ... | ... | ... | ... | ... | ... |
|
||||
|
||||
### Recommended IP plan
|
||||
|
||||
Split `10.0.10.0/24` (or whatever your LAN is) into role-based address ranges so the network is scannable at a glance:
|
||||
|
||||
| Range | Reserved for |
|
||||
|---|---|
|
||||
| `.1 - .9` | Network gear (router, switches, APs) |
|
||||
| `.10 - .19` | Proxmox host(s) + PBS |
|
||||
| `.20 - .39` | Edge / identity / comms (critical infra) |
|
||||
| `.40 - .79` | Application LXCs (productivity, automation, business, monitoring) |
|
||||
| `.80 - .99` | Media VM(s) |
|
||||
| `.100 - .199` | DHCP pool (clients, phones, laptops) |
|
||||
| `.200 - .249` | Labs / experimental |
|
||||
| `.250 - .254` | Reserved |
|
||||
|
||||
### How to set static on a Proxmox LXC
|
||||
|
||||
Two methods — pick one and stick with it:
|
||||
|
||||
**Method A — Proxmox CLI (recommended, survives reboots cleanly):**
|
||||
```bash
|
||||
pct set <ID> -net0 name=eth0,bridge=vmbr0,ip=10.0.10.X/24,gw=10.0.10.1
|
||||
pct reboot <ID>
|
||||
```
|
||||
|
||||
**Method B — Router DHCP reservation:**
|
||||
- Reserve the IP in your router's DHCP table by MAC address. LXC stays "DHCP" technically, but always gets the same IP.
|
||||
- Easier if you have many hosts and one router.
|
||||
- Risk: if the LXC's MAC changes (rebuild from snapshot to new ID), reservation breaks.
|
||||
|
||||
**Recommendation:** Method A (`pct set`) for everything critical (edge, identity, comms, business). Method B is fine for labs/experimental LXCs.
|
||||
|
||||
### Audit checklist
|
||||
|
||||
1. List every LXC: `pct list`
|
||||
2. List every VM: `qm list`
|
||||
3. For each, run `pct exec <ID> -- ip a` (or `qm guest exec <ID> -- ip a` for VMs) and check whether the IP came from DHCP
|
||||
4. Fill in `host-list.md`
|
||||
5. Pick target IPs from the range plan above
|
||||
6. Convert one at a time, lowest-risk first (labs → productivity → business → comms → identity → edge)
|
||||
7. **After each conversion**, verify the Caddy reverse-proxy entry still works (curl from outside)
|
||||
8. Update `host-list.md` status column
|
||||
|
||||
### Hosts known to need conversion right now
|
||||
|
||||
- **LXC 210 (cal)** — ✅ converted: was DHCP `10.0.10.228/24`, now static at the same address via `pct set` (this had to be done before the Caddy migration)
|
||||
|
||||
---
|
||||
|
||||
## Backlog (priority order)
|
||||
|
||||
### P0 — next batch after Phase 1 admin bootstrap
|
||||
1. **Umami** — analytics on landing pages, 10 min to deploy, immediate signal
|
||||
2. **Uptime Kuma** — monitor what you already have
|
||||
3. **Dockge** — UI over existing compose
|
||||
4. **Beszel** — homelab resource visibility
|
||||
5. **Mealie** — family recipes, simple win
|
||||
|
||||
### P1 — when ready
|
||||
- **Outline** — wiki for client docs
|
||||
- **Linkwarden** — bookmarks with full-page archive
|
||||
- **Plane** — Jira-lite project management (pair with Mattermost)
|
||||
|
||||
### P2 — when you have a real need
|
||||
- **Crater** — invoicing (Phase 6)
|
||||
- **Immich** — photos (Phase 5)
|
||||
- **Paperless-ngx** — document scanning (Phase 8)
|
||||
- **Huginn** — first when you have a monitoring use case
|
||||
- **Windmill** — when n8n hits limits
|
||||
- **Trigger.dev** — durable background jobs in code (better fit than Windmill for QA work)
|
||||
- **PrivateBin** — encrypted paste for sharing secrets with contractors
|
||||
- **Addy.io** — email aliases
|
||||
- **SiYuan** — if PhD work picks up
|
||||
- **Flowise** — labs only, when LLM workflow use case appears
|
||||
|
||||
### Skip / declined
|
||||
- ~~PhotoPrism~~ — Immich covers it
|
||||
- ~~Activepieces~~ — you already have n8n
|
||||
- ~~Affine / Trilium~~ — picked Outline + SiYuan instead
|
||||
- ~~Matrix/Synapse + Element~~ — staying on Mattermost
|
||||
- ~~Coolify / Dokploy / CapRover~~ — Dockge is enough; revisit only if writing many custom apps
|
||||
|
||||
---
|
||||
|
||||
## Backup strategy
|
||||
|
||||
- **Proxmox Backup Server (PBS)** or `vzdump` to a NAS — snapshot each LXC/VM nightly
|
||||
- **Critical groups** (`identity`, `comms`, `business`): 7 daily + 4 weekly + 12 monthly
|
||||
- **Productivity/automation**: 7 daily + 4 weekly
|
||||
- **Labs**: 3 daily, no long retention
|
||||
- **Off-site copy** of `identity` and `business` LXCs — these contain auth and billing data. Encrypted copy to Wasabi or Backblaze B2.
|
||||
|
||||
The whole LXC gets snapshotted — much simpler than file-level container backup.
|
||||
|
||||
**Done on pve10 (2026-05-22):** `pct snapshot` **`backup-20260522`** on LXCs **217** (identity) and **218** (monitoring).
|
||||
|
||||
---
|
||||
|
||||
## Next steps (priority order)
|
||||
|
||||
See **[homelab-status-2026-05-22.md](homelab-status-2026-05-22.md)** for done vs todo.
|
||||
|
||||
| # | Task | Effort | Doc |
|
||||
|---|------|--------|-----|
|
||||
| 1 | **Kuma SMTP** test in UI | 5 min | [monitoring-stack.md](monitoring-stack.md) |
|
||||
| 2 | **UniFi DHCP reservations** | 20 min | [unifi-static-dhcp.md](unifi-static-dhcp.md) |
|
||||
| 3 | **Cal.com → Authentik OIDC** | 1–2 h | Phase 3 above |
|
||||
| 4 | **Retire Nextcloud VM 201** | 30 min | [nextcloud-export-2026-05-21.md](nextcloud-export-2026-05-21.md) |
|
||||
| 5 | **NAS.SP00** disk replace → Jellyfin | hardware | [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md) |
|
||||
| 6 | **Caddy → edge LXC `.20`** | ~30 min | Phase 1.5 |
|
||||
|
||||
**Defer:** Immich, Crater, Beszel until the tasks above are done. Nextcloud SSO is not deferred but dropped — VM 201 is being retired.
|
||||
|
||||
### Nextcloud decommission (VM 201)
|
||||
|
||||
1. Confirm export in `exports/nextcloud-2026-05-21/` is complete
|
||||
2. Delete **Nextcloud** monitor in Kuma
|
||||
3. Remove `nextcloud.levkin.ca` from Caddy VM
|
||||
4. Stop VM 201; update [host-list.md](host-list.md)
|
||||
5. After NAS healthy: optional `vzdump` archive then delete disk
|
||||
|
||||
---
|
||||
|
||||
## Important rules
|
||||
|
||||
1. **Never put Authentik behind itself.** `auth.levkin.ca` is a simple Caddy passthrough — no forward-auth, no fancy dependencies. If the login page were protected by Authentik's own forward-auth, an Authentik outage would lock you out of the very UI you need to fix it.
|
||||
2. **Vaultwarden stays standalone.** It's your break-glass path if Authentik dies. Don't OIDC it.
|
||||
3. **Keep a local admin password on every SSO-wired app.** OIDC integrations break during upgrades — you need to log in to fix them.
|
||||
4. **Local admin to Proxmox host.** Independent of Authentik and Vaultwarden. Written down somewhere physical.
|
||||
5. **Don't expose admin UIs publicly.** Dockge, Beszel, Uptime Kuma admin, n8n editor — use Tailscale or Wireguard for remote access.
|
||||
6. **Static IPs for every LXC.** DHCP will eventually move them and Caddy will break. Set via `pct set <id> -net0 ...ip=10.0.10.X/24,gw=...` or a router reservation.
|
||||
7. **Cal.com LXC (210)** — static at `.228` ✅.
|
||||
8. **Maintain `host-list.md`** as the single source of truth for IPs. Update it whenever a new LXC/VM is created or migrated.
|
||||
232
docs/guides/monitoring-stack.md
Normal file
232
docs/guides/monitoring-stack.md
Normal file
@ -0,0 +1,232 @@
|
||||
# Monitoring stack (LXC 218)
|
||||
|
||||
**Host:** `monitoring` @ `10.0.10.22` (PVENAS pve10, VMID **218**)
|
||||
**Compose:** `/opt/monitoring/compose.yml`
|
||||
**Stacks dir (Dockge):** `/opt/stacks`
|
||||
|
||||
All admin UIs are **LAN-only** (no public Caddy blocks). Use Tailscale or local network.
|
||||
|
||||
| Service | URL | Port | Notes |
|
||||
|---------|-----|------|-------|
|
||||
| **Uptime Kuma** | http://10.0.10.22:3001 | 3001 | Admin + monitors configured ✅ (replaces pve201 LXC **305** @ `.197`, stopped) |
|
||||
| **Dockge** | http://10.0.10.22:5001 | 5001 | Manage compose on **this LXC only** |
|
||||
| **Umami** | http://10.0.10.22:3000 | 3000 | Password changed ✅; caseware + auto tracked |
|
||||
|
||||
Secrets: `/opt/monitoring/.env` on the LXC (mode 600). Not in git.
|
||||
|
||||
---
|
||||
|
||||
## Backups (pve10)
|
||||
|
||||
| Guest | VMID | Snapshot | Date |
|
||||
|-------|------|----------|------|
|
||||
| identity | 217 | `backup-20260522` | 2026-05-22 |
|
||||
| monitoring | 218 | `backup-20260522` | 2026-05-22 |
|
||||
|
||||
On pve10:
|
||||
|
||||
```bash
|
||||
pct listsnapshot 217
|
||||
pct listsnapshot 218
|
||||
# Rollback if needed:
|
||||
# pct rollback 217 backup-20260522
|
||||
```
|
||||
|
||||
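Optional off-node copy (when NAS healthy): `vzdump 217 218 --storage local --mode snapshot --compress zstd` — note that `--storage local` writes the archive to pve10's own disk; for a true off-node copy, substitute a NAS- or PBS-backed storage ID.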
|
||||
|
||||
---
|
||||
|
||||
## Uptime Kuma — monitors
|
||||
|
||||
Configured in UI (all green). **Remove** the Nextcloud monitor when VM 201 is retired.
|
||||
|
||||
| Name | URL |
|
||||
|------|-----|
|
||||
| Authentik | https://auth.levkin.ca |
|
||||
| Cal.com | https://cal.levkin.ca |
|
||||
| Caseware / Auto | marketing sites |
|
||||
| Mailcow | https://mail.levkine.ca |
|
||||
| Listmonk, Gitea, Vault, Todo, PVE nodes | per your dashboard |
|
||||
|
||||
---
|
||||
|
||||
## Uptime Kuma — email alerts (Mailcow)
|
||||
|
||||
Mail domain is **`levkine.ca`** (with **e**). Cal.com already sends via Mailcow as `cal@levkine.ca`.
|
||||
|
||||
### Which email to use
|
||||
|
||||
| Role | Address | Notes |
|
||||
|------|---------|-------|
|
||||
| **SMTP server** | `mail.levkine.ca` | Mailcow host |
|
||||
| **SMTP port** | `587` | STARTTLS (not 465 unless you prefer SMTPS) |
|
||||
| **From (sender)** | `alerts@levkine.ca` | Create mailbox in Mailcow if it does not exist |
|
||||
| **To (you)** | `idobkin@gmail.com` or `ilia@levkine.ca` | Use whichever you read; Gmail is fine for alerts |
|
||||
|
||||
### 1. Create mailbox in Mailcow (if needed)
|
||||
|
||||
**Automated (needs Mailcow API key):**
|
||||
|
||||
```bash
|
||||
# Define mailbox in group_vars/all/mailcow.yml, password in vault:
|
||||
make mailcow-mailbox MAILBOX=alerts
|
||||
# (alias: make mailcow-create-alerts)
|
||||
|
||||
# Import from .env into vault once, then delete .env:
|
||||
cp .env.example .env # MAILCOW_API_KEY=... ALERTS_PASSWORD=...
|
||||
make vault-import-env
|
||||
make mailcow-mailbox MAILBOX=alerts
|
||||
```
|
||||
|
||||
To add another mailbox tomorrow: edit `mailcow.yml` + `vault_mailcow_mailbox_passwords.<name>`, then `make mailcow-mailbox MAILBOX=<name>`.
|
||||
|
||||
**Manual UI:**
|
||||
|
||||
1. https://mail.levkine.ca → admin login
|
||||
2. **Email → Mailboxes → Add** → `alerts@levkine.ca` (strong password → store in Vaultwarden)
|
||||
3. Optional: alias `monitoring@levkine.ca` → same inbox
|
||||
|
||||
### 2. Add notification in Kuma
|
||||
|
||||
**Automated (from your Mac, after mailbox exists):**
|
||||
|
||||
```bash
|
||||
cd /path/to/ansible
|
||||
pip install uptime-kuma-api # or: .venv/bin/pip install uptime-kuma-api
|
||||
export KUMA_URL=http://10.0.10.22:3001 KUMA_USER=admin KUMA_PASSWORD='...'
|
||||
export SMTP_USER=alerts@levkine.ca SMTP_PASS='...' SMTP_TO=idobkin@gmail.com
|
||||
./scripts/kuma-setup-smtp.sh
|
||||
```
|
||||
|
||||
**Manual UI:**
|
||||
|
||||
1. http://10.0.10.22:3001 → **Settings** → **Notifications** → **Setup Notification**
|
||||
2. Type: **Email (SMTP)**
|
||||
3. Fill in:
|
||||
|
||||
| Field | Value |
|
||||
|-------|--------|
|
||||
| SMTP Host | `mail.levkine.ca` |
|
||||
| SMTP Port | `587` |
|
||||
| Security | TLS / STARTTLS |
|
||||
| Username | `alerts@levkine.ca` |
|
||||
| Password | mailbox password |
|
||||
| From Email | `alerts@levkine.ca` |
|
||||
| To Email | `idobkin@gmail.com` (or your `@levkine.ca`) |
|
||||
|
||||
4. **Test** → save
|
||||
5. Edit each monitor (or default) → **Notifications** → enable this channel
|
||||
|
||||
**Alternative:** Mattermost webhook (`slack.levkin.ca`) if you prefer chat over email.
|
||||
|
||||
---
|
||||
|
||||
## Dockge — what to do after login
|
||||
|
||||
**On server today:**
|
||||
|
||||
| Path | Contents |
|
||||
|------|----------|
|
||||
| `/opt/monitoring/compose.yml` | **Live** stack (Docker project `monitoring`, 4 containers running) |
|
||||
| `/opt/stacks/monitoring/compose.yaml` | Copy for Dockge (same services) |
|
||||
| `/opt/stacks/authentik-ref/`, `cal-ref/` | README only — **no** compose file (ignore) |
|
||||
|
||||
**Why “Scan Stacks Folder” looks empty**
|
||||
|
||||
- Scan only picks up folders under **`/opt/stacks`** that contain `compose.yaml` / `compose.yml`.
|
||||
- Your containers were started from **`/opt/monitoring`**, so Docker does not automatically link them to `/opt/stacks/monitoring` until you register that folder in Dockge.
|
||||
|
||||
**Fix (pick one):**
|
||||
|
||||
### Dockge UI note (your version)
|
||||
|
||||
**Settings → General** only has hostname — there is **no “Stacks directory” field**. That path is fixed at deploy time:
|
||||
|
||||
`DOCKGE_STACKS_DIR=/opt/stacks` (already set in `/opt/monitoring/compose.yml`).
|
||||
|
||||
Stacks are managed from the **home / dashboard** page, not Settings.
|
||||
|
||||
### Option 1 — Add stack manually (recommended)
|
||||
|
||||
1. http://10.0.10.22:5001 → **home** (logo / dashboard, not Settings)
|
||||
2. **+ Create Stack** (or **Compose** → new stack)
|
||||
3. Name: `monitoring`
|
||||
4. Path: `/opt/stacks/monitoring` (must contain `compose.yaml`)
|
||||
5. Open stack → review compose → **do not Start** until old project is stopped (below)
|
||||
|
||||
### Option 2 — Scan from dashboard menu
|
||||
|
||||
1. Stay on **dashboard** (not Settings)
|
||||
2. Top-right **⋮** → **Scan Stacks Folder**
|
||||
3. Pick **`monitoring`** if it appears (`authentik-ref` / `cal-ref` have no compose — ignore)
|
||||
|
||||
**Avoid duplicate containers**
|
||||
|
||||
Before starting from Dockge:
|
||||
|
||||
```bash
|
||||
ssh root@10.0.10.22
|
||||
cd /opt/monitoring && docker compose down
|
||||
# Then start from Dockge UI on stack monitoring, OR:
|
||||
cd /opt/stacks/monitoring && docker compose --env-file .env up -d
|
||||
```
|
||||
|
||||
Until you do that, Kuma/Dockge/Umami keep running from `/opt/monitoring`; Dockge is optional for edits until cutover.
|
||||
|
||||
### Optional reference stacks (read-only)
|
||||
|
||||
Create empty stacks under `/opt/stacks/` only if you want a UI placeholder:
|
||||
|
||||
```bash
|
||||
ssh root@10.0.10.22
|
||||
mkdir -p /opt/stacks/authentik /opt/stacks/cal
|
||||
# Copy compose for reference (does NOT control remote host):
|
||||
scp root@10.0.10.21:/opt/authentik/compose.yml /opt/stacks/authentik/
|
||||
```
|
||||
|
||||
To **manage** Authentik or Cal from Dockge long term, either move compose to 218 (not recommended) or install Dockge on each LXC later.
|
||||
|
||||
### Step 3 — Retire Portainer
|
||||
|
||||
When comfortable: stop VM **109** (portainer) on pve10; use Dockge on 218 instead.
|
||||
|
||||
---
|
||||
|
||||
## Umami
|
||||
|
||||
- ✅ Running at http://10.0.10.22:3000 (LAN / Tailscale only)
|
||||
- ✅ **Public tracking** via `https://stats.levkin.ca/script.js` on caseware, auto, and **iliadobkin.com** (portfolio LXC 219)
|
||||
|
||||
**Three choices (pick one later; none block the sites):**
|
||||
|
||||
| Option | Effort | Notes |
|
||||
|--------|--------|--------|
|
||||
| **A — Skip public analytics** | 0 | Use Umami dashboard on `:3000` when you care; no DNS/Caddy |
|
||||
| **B — One DNS + Caddy block** | ~10 min | A record → home IP + Caddy `reverse_proxy 10.0.10.22:3000` on caddy VM |
|
||||
| **C — Re-add script tags** | 2 min | After B works, insert script before `</head>` on 215/216 |
|
||||
|
||||
**Suggested public hostname (instead of `analytics`):** `stats.levkin.ca` (short, clear). Alternatives: `umami.levkin.ca`, `metrics.levkin.ca`.
|
||||
|
||||
```caddy
|
||||
stats.levkin.ca {
|
||||
import security-headers
|
||||
encode gzip
|
||||
reverse_proxy 10.0.10.22:3000
|
||||
}
|
||||
```
|
||||
|
||||
Script tag then: `https://stats.levkin.ca/script.js`
|
||||
|
||||
We are **not stuck** — marketing sites do not need Umami to render. Option A is fine for now.
|
||||
|
||||
---
|
||||
|
||||
## Maintenance
|
||||
|
||||
```bash
|
||||
ssh root@10.0.10.22
|
||||
cd /opt/monitoring
|
||||
docker compose --env-file .env pull
|
||||
docker compose --env-file .env up -d
|
||||
docker compose ps
|
||||
```
|
||||
203
docs/guides/nas-sp00-drive-failure-report.md
Normal file
203
docs/guides/nas-sp00-drive-failure-report.md
Normal file
@ -0,0 +1,203 @@
|
||||
# NAS.SP00 drive failure — IT report
|
||||
|
||||
**Date:** 2026-05-21
|
||||
**Host:** PVENAS (Proxmox VE) — `10.0.10.10`
|
||||
**Pool:** ZFS `NAS.SP00` (~9 TB, ~862 GB used)
|
||||
**Prepared for:** IT / hardware replacement
|
||||
**SMART audit:** [nas-sp00-smart-audit-2026-05-21.md](nas-sp00-smart-audit-2026-05-21.md)
|
||||
|
||||
---
|
||||
|
||||
## Executive summary
|
||||
|
||||
One disk in the four-drive ZFS pool (two mirrored pairs) has **failed at the hardware level**. The pool is **DEGRADED** but **online** with **no known data errors** at this time. The failed drive must be **physically replaced** and the pool **resilvered**. Until then, **mirror-0 has no redundancy** — a failure of the remaining disk in that mirror (`W4J0L0BA`) could cause data loss.
|
||||
|
||||
This issue also caused a **host-wide I/O wedge** (pool SUSPENDED → stuck `sync()`), which blocked LXC/VM operations unrelated to the pool (e.g. Cal.com on `local-lvm`). That was cleared by a forced node reboot; **replacing the drive remains required**.
|
||||
|
||||
---
|
||||
|
||||
## Pool layout
|
||||
|
||||
| Vdev | Role | Disk A | Disk B | Status |
|
||||
|------|------|--------|--------|--------|
|
||||
| mirror-0 | RAID1 pair | `W4J0L0BA` (sda, 5 TB) | `W4J0L3PY` (sdb) | **DEGRADED** — sdb UNAVAIL |
|
||||
| mirror-1 | RAID1 pair | `W4J0LKCD` (sdd, 5 TB) | `W4J0K9V7` (sdc, 5 TB) | **ONLINE** |
|
||||
|
||||
Model family (healthy drives): Seagate **ST5000DM000-1FK178** (5 TB; SMART reports 5980 rpm on the healthy drives — only the failed disk reports 7200 rpm).
|
||||
|
||||
---
|
||||
|
||||
## Failed drive identification
|
||||
|
||||
| Field | Expected | Observed |
|
||||
|-------|----------|----------|
|
||||
| **Serial** | W4J0L3PY | W4J0L3PY |
|
||||
| **Model** | ST5000DM000-1FK178 | ST5000DM000 (truncated reporting) |
|
||||
| **WWN** | — | `5000c50082cc8bbb` |
|
||||
| **Firmware** | — | CC48 |
|
||||
| **Capacity** | ~5,000,981,078,016 bytes (**5.00 TB**) | **137,438,952,960 bytes (~137 GB)** |
|
||||
| **Linux device** | `/dev/sdb` | `/dev/sdb` |
|
||||
| **ZFS state** | ONLINE | **UNAVAIL** — label missing/invalid |
|
||||
|
||||
ZFS last known path:
|
||||
`/dev/disk/by-id/ata-ST5000DM000-1FK178_W4J0L3PY-part1`
|
||||
|
||||
---
|
||||
|
||||
## Symptoms and evidence
|
||||
|
||||
### 1. Capacity collapse (primary indicator)
|
||||
|
||||
The drive is detected as **~137 GB** instead of **5 TB**. ZFS cannot use a partition label created for a 5 TB disk on a device that exposes only a tiny fraction of capacity. This pattern is typical of:
|
||||
|
||||
- **Failed HDD** (media/controller failure)
|
||||
- **Bad SATA cable, backplane port, or HBA port**
|
||||
- **USB/SATA bridge failure** (if applicable)
|
||||
- **Severe firmware/HPA corruption** (less common)
|
||||
|
||||
### 2. SMART / SCSI errors
|
||||
|
||||
`smartctl` against `/dev/sdb`:
|
||||
|
||||
- **Read SMART Data failed:** scsi error aborted command
|
||||
- **Overall health:** UNKNOWN (attributes unreadable)
|
||||
- Multiple log read commands fail (Error Log, Self-test Log, GP Log, etc.)
|
||||
|
||||
Healthy sibling in same mirror (`/dev/sda`, W4J0L0BA): **SMART PASSED**, full 5 TB capacity.
|
||||
|
||||
### 3. Kernel log (`dmesg` at boot, 2026-05-21 ~21:27)
|
||||
|
||||
Repeated on **`sdb`**:
|
||||
|
||||
```
|
||||
Buffer I/O error on dev sdb
|
||||
Sense Key: Medium Error
|
||||
Add. Sense: Unrecovered read error
|
||||
critical medium error, dev sdb, sector N op 0x0:(READ)
|
||||
```
|
||||
|
||||
Indicates the block device cannot reliably read media — **hardware or link layer**, not a ZFS configuration issue.
|
||||
|
||||
### 4. ZFS pool history
|
||||
|
||||
- Pool previously entered **SUSPENDED** state (I/O failures on faulted devices).
|
||||
- After node reboot: pool **DEGRADED**, short **resilver** completed with **0 errors** (healing scan on remaining devices).
|
||||
- Current: **No known data errors** in `zpool status`.
|
||||
|
||||
---
|
||||
|
||||
## Impact
|
||||
|
||||
### Storage / services on `NAS.SP00`
|
||||
|
||||
Proxmox guests with disks on this pool (non-exhaustive):
|
||||
|
||||
| VMID | Name | NAS-backed storage |
|
||||
|------|------|-------------------|
|
||||
| 101 | Jellyfin | 1 TB zvol |
|
||||
| 105 | TrueNAS | 1 TB zvol |
|
||||
| 108 | actual-debian | 10 GB |
|
||||
| 200 | PVE.BU.SVR | 1 TB |
|
||||
| 201 | NextcloudAIO-debian | 8 TB |
|
||||
|
||||
**Risk:** With mirror-0 degraded, blocks stored only on the surviving mirror-0 disk have **no redundancy** until the failed drive is replaced and resilver completes.
|
||||
|
||||
### Unrelated workloads
|
||||
|
||||
Guests on **`local-lvm`** (NVMe, e.g. Cal.com LXC 210, Caddy VM 106) are **not stored on NAS.SP00** but were affected when the pool suspended and blocked system-wide `sync()`.
|
||||
|
||||
### Backup target
|
||||
|
||||
Proxmox datastore **PVEBUVD00** (PBS @ `10.0.10.200:8007`) reports **unreachable** from this node — separate issue; verify PBS host/network.
|
||||
|
||||
---
|
||||
|
||||
## Diagnosis
|
||||
|
||||
| Question | Answer |
|
||||
|----------|--------|
|
||||
| Is this a ZFS misconfiguration? | **No** — config is consistent; three drives show correct 5 TB labels. |
|
||||
| Is the pool lost? | **No** — degraded but importable; no known data errors currently. |
|
||||
| Which disk to replace? | **Seagate W4J0L3PY** (`/dev/sdb`, mirror-0 failed leg). |
|
||||
| Can we fix it in software? | **Unlikely** — capacity and SMART failures point to hardware. |
|
||||
| Safe to reseat first? | **Optional trial** — power down or hot-swap per chassis policy; if capacity still reads ~137 GB, **replace disk**. |
|
||||
|
||||
---
|
||||
|
||||
## Recommended actions
|
||||
|
||||
### Immediate (IT / on-site)
|
||||
|
||||
1. **Identify physical slot** for serial **W4J0L3PY** (compare to inventory/asset tags).
|
||||
2. **Reseat** SATA/SAS cable and backplane connection once (if hot-swap policy allows). Reboot or rescan SCSI bus.
|
||||
3. If capacity is still wrong or SMART still fails → **replace with new 5 TB+ enterprise/NAS-class HDD** (match class of ST5000DM000 or better).
|
||||
4. Do **not** remove the UNAVAIL device from the pool until replacement is in place.
|
||||
|
||||
### After new disk is installed
|
||||
|
||||
On **PVENAS** as root (adjust `/dev/disk/by-id/...` to the **new** drive — pass the whole disk, not a partition; a blank disk has no `-part1` yet and ZFS creates the partitions itself):
|
||||
|
||||
```bash
|
||||
# Verify new disk shows ~5 TB
|
||||
lsblk /dev/sdX
|
||||
smartctl -H /dev/sdX
|
||||
|
||||
# Replace failed vdev (old device = GUID shown for the UNAVAIL member in: zpool status NAS.SP00)
|
||||
zpool replace NAS.SP00 11449632222283419591 /dev/disk/by-id/ata-NEW_MODEL_NEW_SERIAL
|
||||
|
||||
# Monitor until resilver completes
|
||||
zpool status -v NAS.SP00
|
||||
```
|
||||
|
||||
### Post-resilver
|
||||
|
||||
- Run **`zpool scrub NAS.SP00`** during a maintenance window.
|
||||
- Confirm **PVEBUVD00** / PBS connectivity if backups depend on it.
|
||||
- Review whether **Nextcloud VM 201** (8 TB on degraded pool) should remain running until healthy.
|
||||
|
||||
### Not recommended
|
||||
|
||||
- Ignoring degraded state for extended periods.
|
||||
- Running heavy I/O on large VMs (e.g. 8 TB Nextcloud) during extended degraded operation.
|
||||
- `zpool clear` without addressing hardware — does not fix a dead disk.
|
||||
|
||||
---
|
||||
|
||||
## Reference — healthy disks (for spare matching)
|
||||
|
||||
| Serial | Device | Capacity | SMART |
|
||||
|--------|--------|----------|-------|
|
||||
| W4J0L0BA | sda | 5.00 TB | PASSED |
|
||||
| W4J0K9V7 | sdc | 5.00 TB | PASSED |
|
||||
| W4J0LKCD | sdd | 5.00 TB | PASSED |
|
||||
|
||||
---
|
||||
|
||||
## Timeline (brief)
|
||||
|
||||
| When | Event |
|
||||
|------|--------|
|
||||
| Prior to 2026-05-21 | `W4J0L3PY` accumulated read/write errors; pool faulted |
|
||||
| 2026-05-21 | Pool **SUSPENDED**; host `sync()` wedged; Cal LXC start failed |
|
||||
| 2026-05-21 ~21:28 | Forced node reboot; pool **DEGRADED**, resilver finished, 0 errors |
|
||||
| 2026-05-21 | `sdb` still reports **~137 GB**, UNAVAIL — **replacement still required** |
|
||||
|
||||
---
|
||||
|
||||
## Contact / handoff notes
|
||||
|
||||
- **Node:** Proxmox VE 9.x (Debian 13 "trixie" base) on **PVENAS** (`10.0.10.10`)
|
||||
- **Pool name in Proxmox:** `NAS.SP00` (zfspool, active, degraded)
|
||||
- **Failed serial:** **W4J0L3PY**
|
||||
- **Replacement type:** 5 TB+ HDD, same or better class as Seagate ST5000DM000-1FK178
|
||||
|
||||
For questions about homelab service impact (Cal, Caddy, Phase 0 rollout), see [`levkin-selfhost-plan-2.md`](levkin-selfhost-plan-2.md).
|
||||
## TL;DR
|
||||
|
||||
- Pool `NAS.SP00` on `PVENAS` (10.0.10.10) had a disk failure (`W4J0L3PY`)
|
||||
- Pool went **SUSPENDED**; required forced reboot and is now **DEGRADED**
|
||||
- **Immediate action:** Replace the failed drive with a spare (same or larger size; see healthy serials in the reference table above)
|
||||
- Use `zpool replace` command with correct device paths (see main procedure)
|
||||
- Monitor resilver to completion; run `zpool scrub` after
|
||||
- Backup services and large VMs (e.g. Nextcloud 8TB) depend on pool health—keep degraded time short
|
||||
- Reach out if unsure about pool status or downstream service risk
|
||||
232
docs/guides/nas-sp00-smart-audit-2026-05-21.md
Normal file
232
docs/guides/nas-sp00-smart-audit-2026-05-21.md
Normal file
@ -0,0 +1,232 @@
|
||||
# NAS.SP00 SMART audit
|
||||
|
||||
**Date:** 2026-05-21
|
||||
**Host:** PVENAS (Proxmox VE) — `10.0.10.10`
|
||||
**Pool:** ZFS `NAS.SP00`
|
||||
**Related:** [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md)
|
||||
|
||||
---
|
||||
|
||||
## Executive summary
|
||||
|
||||
| Serial | Device | Capacity | ZFS (mirror) | SMART health |
|
||||
|--------|--------|----------|--------------|--------------|
|
||||
| W4J0L0BA | sda | 5.00 TB | mirror-0 ONLINE | **PASSED** |
|
||||
| W4J0L3PY | sdb | **137 GB** | mirror-0 UNAVAIL | **UNKNOWN** (read fails) |
|
||||
| W4J0K9V7 | sdc | 5.00 TB | mirror-1 ONLINE | **PASSED** |
|
||||
| W4J0LKCD | sdd | 5.00 TB | mirror-1 ONLINE | **PASSED** |
|
||||
|
||||
Pool state at audit time: **DEGRADED** — failed leg `W4J0L3PY` (`/dev/sdb`). No known data errors. Three healthy drives show no reallocated, pending, or uncorrectable sectors.
|
||||
|
||||
---
|
||||
|
||||
## ZFS pool status
|
||||
|
||||
```
|
||||
pool: NAS.SP00
|
||||
state: DEGRADED
|
||||
status: One or more devices could not be used because the label is missing or
|
||||
invalid. Sufficient replicas exist for the pool to continue
|
||||
functioning in a degraded state.
|
||||
action: Replace the device using 'zpool replace'.
|
||||
scan: resilvered 0B in 00:00:01 with 0 errors on Thu May 21 21:27:54 2026
|
||||
|
||||
NAME STATE READ WRITE CKSUM
|
||||
NAS.SP00 DEGRADED 0 0 0
|
||||
mirror-0 DEGRADED 0 0 0
|
||||
ata-ST5000DM000-1FK178_W4J0L0BA ONLINE 0 0 0
|
||||
11449632222283419591 UNAVAIL 0 0 0 was /dev/disk/by-id/ata-ST5000DM000-1FK178_W4J0L3PY-part1
|
||||
mirror-1 ONLINE 0 0 0
|
||||
ata-ST5000DM000-1FK178_W4J0LKCD ONLINE 0 0 0
|
||||
ata-ST5000DM000-1FK178_W4J0K9V7 ONLINE 0 0 0
|
||||
|
||||
errors: No known data errors
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Block devices (`lsblk`)
|
||||
|
||||
| NAME | SIZE | MODEL | SERIAL | ROTA |
|
||||
|------|------|-------|--------|------|
|
||||
| sda | 4.5T | ST5000DM000-1FK178 | W4J0L0BA | 1 |
|
||||
| sdb | 3.9G | ST5000DM000 | W4J0L3PY | 1 |
|
||||
| sdc | 4.5T | ST5000DM000-1FK178 | W4J0K9V7 | 1 |
|
||||
| sdd | 4.5T | ST5000DM000-1FK178 | W4J0LKCD | 1 |
|
||||
|
||||
---
|
||||
|
||||
## Healthy drives — key metrics
|
||||
|
||||
| Metric | sda (W4J0L0BA) | sdc (W4J0K9V7) | sdd (W4J0LKCD) |
|
||||
|--------|----------------|----------------|----------------|
|
||||
| Model | ST5000DM000-1FK178 | ST5000DM000-1FK178 | ST5000DM000-1FK178 |
|
||||
| Firmware | CC48 | CC48 | CC48 |
|
||||
| WWN | 5000c50082c02f61 | 5000c50082c7e2ce | 5000c50082d84c45 |
|
||||
| Rotation | 5980 rpm | 5980 rpm | 5980 rpm |
|
||||
| SATA | 3.1 @ 6.0 Gb/s | 3.1 @ 6.0 Gb/s | 3.1 @ 6.0 Gb/s |
|
||||
| Power-on hours | 52,481 (~6.0 y) | 53,087 (~6.1 y) | 45,580 (~5.2 y) |
|
||||
| Temperature | 27 °C | 30 °C | 30 °C |
|
||||
| Reallocated sectors | 0 | 0 | 0 |
|
||||
| Current pending sectors | 0 | 0 | 0 |
|
||||
| Offline uncorrectable | 0 | 0 | 0 |
|
||||
| UDMA CRC errors | 0 | 0 | 0 |
|
||||
| Start/stop count | 350 | 367 | 310 |
|
||||
| Load cycle count | 348,974 | 340,961 | 184,891 |
|
||||
| Power cycle count | 345 | 363 | 309 |
|
||||
|
||||
High **Load_Cycle_Count** on Seagate Desktop HDD.15 is common (head parking); not alarming when reallocated/pending counts remain zero.
|
||||
|
||||
---
|
||||
|
||||
## Failed drive — `/dev/sdb` (W4J0L3PY)
|
||||
|
||||
### Identity
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Device Model | ST5000DM000 (truncated; not full -1FK178 suffix) |
|
||||
| Serial | W4J0L3PY |
|
||||
| WWN | 5000c50082cc8bbb |
|
||||
| Firmware | CC48 |
|
||||
| User capacity | 137,438,952,960 bytes [**137 GB**] |
|
||||
| Expected capacity | 5,000,981,078,016 bytes [5.00 TB] |
|
||||
| Rotation | 7200 rpm (reported) |
|
||||
| SATA | 3.0, 6.0 Gb/s |
|
||||
|
||||
### SMART
|
||||
|
||||
```
|
||||
Read SMART Data failed: scsi error aborted command
|
||||
SMART Status command failed: scsi error aborted command
|
||||
SMART overall-health self-assessment test result: UNKNOWN!
|
||||
SMART Status, Attributes and Thresholds cannot be read.
|
||||
```
|
||||
|
||||
**Action:** Replace drive; see [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md).
|
||||
|
||||
---
|
||||
|
||||
## Full SMART attributes (healthy drives)
|
||||
|
||||
### `/dev/sda` — W4J0L0BA (mirror-0, ONLINE)
|
||||
|
||||
```
|
||||
SMART overall-health self-assessment test result: PASSED
|
||||
|
||||
ID# ATTRIBUTE_NAME VALUE WORST THRESH TYPE RAW_VALUE
|
||||
1 Raw_Read_Error_Rate 119 100 006 Pre-fail 211189952
|
||||
3 Spin_Up_Time 092 091 000 Pre-fail 0
|
||||
4 Start_Stop_Count 100 100 020 Old_age 350
|
||||
5 Reallocated_Sector_Ct 100 100 010 Pre-fail 0
|
||||
7 Seek_Error_Rate 080 060 030 Pre-fail 43979429424
|
||||
9 Power_On_Hours 041 041 000 Old_age 52481
|
||||
10 Spin_Retry_Count 100 100 097 Pre-fail 0
|
||||
12 Power_Cycle_Count 100 100 020 Old_age 345
|
||||
183 Runtime_Bad_Block 100 100 000 Old_age 0
|
||||
184 End-to-End_Error 100 100 099 Old_age 0
|
||||
187 Reported_Uncorrect 100 100 000 Old_age 0
|
||||
188 Command_Timeout 100 099 000 Old_age 3 3 3
|
||||
189 High_Fly_Writes 100 100 000 Old_age 0
|
||||
190 Airflow_Temperature_Cel 073 058 045 Old_age 27 (Min/Max 27/28)
|
||||
191 G-Sense_Error_Rate 100 100 000 Old_age 0
|
||||
192 Power-Off_Retract_Count 100 100 000 Old_age 0
|
||||
193 Load_Cycle_Count 001 001 000 Old_age 348974
|
||||
194 Temperature_Celsius 027 042 000 Old_age 27
|
||||
195 Hardware_ECC_Recovered 119 100 000 Old_age 211189952
|
||||
197 Current_Pending_Sector 100 100 000 Old_age 0
|
||||
198 Offline_Uncorrectable 100 100 000 Old_age 0
|
||||
199 UDMA_CRC_Error_Count 200 200 000 Old_age 0
|
||||
240 Head_Flying_Hours 100 253 000 Old_age 15140h+51m+12.276s
|
||||
241 Total_LBAs_Written 100 253 000 Old_age 57665101118
|
||||
242 Total_LBAs_Read 100 253 000 Old_age 160962549062
|
||||
```
|
||||
|
||||
### `/dev/sdc` — W4J0K9V7 (mirror-1, ONLINE)
|
||||
|
||||
```
|
||||
SMART overall-health self-assessment test result: PASSED
|
||||
|
||||
ID# ATTRIBUTE_NAME VALUE WORST THRESH TYPE RAW_VALUE
|
||||
1 Raw_Read_Error_Rate 117 100 006 Pre-fail 136042192
|
||||
3 Spin_Up_Time 092 091 000 Pre-fail 0
|
||||
4 Start_Stop_Count 100 100 020 Old_age 367
|
||||
5 Reallocated_Sector_Ct 100 100 010 Pre-fail 0
|
||||
7 Seek_Error_Rate 083 060 030 Pre-fail 22512744055
|
||||
9 Power_On_Hours 040 040 000 Old_age 53087
|
||||
10 Spin_Retry_Count 100 100 097 Pre-fail 0
|
||||
12 Power_Cycle_Count 100 100 020 Old_age 363
|
||||
183 Runtime_Bad_Block 100 100 000 Old_age 0
|
||||
184 End-to-End_Error 100 100 099 Old_age 0
|
||||
187 Reported_Uncorrect 100 100 000 Old_age 0
|
||||
188 Command_Timeout 100 099 000 Old_age 6 6 12
|
||||
189 High_Fly_Writes 096 096 000 Old_age 4
|
||||
190 Airflow_Temperature_Cel 070 060 045 Old_age 30 (Min/Max 28/30)
|
||||
191 G-Sense_Error_Rate 100 100 000 Old_age 0
|
||||
192 Power-Off_Retract_Count 100 100 000 Old_age 0
|
||||
193 Load_Cycle_Count 001 001 000 Old_age 340961
|
||||
194 Temperature_Celsius 030 040 000 Old_age 30
|
||||
195 Hardware_ECC_Recovered 117 100 000 Old_age 136042192
|
||||
197 Current_Pending_Sector 100 100 000 Old_age 0
|
||||
198 Offline_Uncorrectable 100 100 000 Old_age 0
|
||||
199 UDMA_CRC_Error_Count 200 200 000 Old_age 0
|
||||
240 Head_Flying_Hours 100 253 000 Old_age 15859h+53m+20.869s
|
||||
241 Total_LBAs_Written 100 253 000 Old_age 57609506493
|
||||
242 Total_LBAs_Read 100 253 000 Old_age 152392393081
|
||||
```
|
||||
|
||||
### `/dev/sdd` — W4J0LKCD (mirror-1, ONLINE)
|
||||
|
||||
```
|
||||
SMART overall-health self-assessment test result: PASSED
|
||||
|
||||
ID# ATTRIBUTE_NAME VALUE WORST THRESH TYPE RAW_VALUE
|
||||
1 Raw_Read_Error_Rate 116 090 006 Pre-fail 108217848
|
||||
3 Spin_Up_Time 092 091 000 Pre-fail 0
|
||||
4 Start_Stop_Count 100 100 020 Old_age 310
|
||||
5 Reallocated_Sector_Ct 100 100 010 Pre-fail 0
|
||||
7 Seek_Error_Rate 073 051 030 Pre-fail 185584998742
|
||||
9 Power_On_Hours 048 048 000 Old_age 45580
|
||||
10 Spin_Retry_Count 100 100 097 Pre-fail 0
|
||||
12 Power_Cycle_Count 100 100 020 Old_age 309
|
||||
183 Runtime_Bad_Block 100 100 000 Old_age 0
|
||||
184 End-to-End_Error 100 100 099 Old_age 0
|
||||
187 Reported_Uncorrect 100 100 000 Old_age 0
|
||||
188 Command_Timeout 100 099 000 Old_age 8 8 14
|
||||
189 High_Fly_Writes 098 098 000 Old_age 2
|
||||
190 Airflow_Temperature_Cel 070 050 045 Old_age 30 (Min/Max 29/30)
|
||||
191 G-Sense_Error_Rate 100 100 000 Old_age 0
|
||||
192 Power-Off_Retract_Count 100 100 000 Old_age 0
|
||||
193 Load_Cycle_Count 008 008 000 Old_age 184891
|
||||
194 Temperature_Celsius 030 050 000 Old_age 30
|
||||
195 Hardware_ECC_Recovered 116 100 000 Old_age 108217848
|
||||
197 Current_Pending_Sector 100 091 000 Old_age 0
|
||||
198 Offline_Uncorrectable 100 091 000 Old_age 0
|
||||
199 UDMA_CRC_Error_Count 200 200 000 Old_age 0
|
||||
240 Head_Flying_Hours 100 253 000 Old_age 11604h+15m+50.842s
|
||||
241 Total_LBAs_Written 100 253 000 Old_age 72962800596
|
||||
242 Total_LBAs_Read 100 253 000 Old_age 167268621195
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## How this audit was collected
|
||||
|
||||
On PVENAS as root:
|
||||
|
||||
```bash
|
||||
zpool status NAS.SP00
|
||||
lsblk -d -o NAME,SIZE,MODEL,SERIAL,ROTA,STATE /dev/sd{a,b,c,d}
|
||||
for d in sda sdb sdc sdd; do smartctl -i -H -A /dev/$d; done
|
||||
```
|
||||
|
||||
Audit timestamp (host local): Thu May 21 22:13:58 2026 EDT.
|
||||
|
||||
---
|
||||
|
||||
## Next steps
|
||||
|
||||
1. Replace **W4J0L3PY** with a 5 TB+ NAS-class HDD (match ST5000DM000-1FK178 or better).
|
||||
2. `zpool replace NAS.SP00` with the new disk by-id.
|
||||
3. Monitor resilver; run `zpool scrub NAS.SP00` after pool is **ONLINE**.
|
||||
4. Re-run SMART audit after replacement for a clean baseline.
|
||||
263
docs/guides/security-audit-report.md
Normal file
263
docs/guides/security-audit-report.md
Normal file
@ -0,0 +1,263 @@
|
||||
# Security Audit Report
|
||||
|
||||
**Date:** 2026-05-20
|
||||
**Auditor:** Automated read-only scan (`scripts/security-audit-*.sh`)
|
||||
**Scope:** Proxmox nodes `pve201` (10.0.10.201) and `pve10` (10.0.10.10), all LXCs via `pct exec`, SSH deep-dive on hypervisors.
|
||||
|
||||
**Repo baseline** (`roles/ssh/defaults/main.yml`): `PermitRootLogin prohibit-password`, `PasswordAuthentication no`, UFW enabled.
|
||||
|
||||
---
|
||||
|
||||
## Executive summary
|
||||
|
||||
| Area | Critical | High | Medium |
|
||||
|------|----------|------|--------|
|
||||
| Hypervisors (201, 10) | 2 | 4 | 2 |
|
||||
| LXCs on 201 (10 running) | 0 | 10 | 8 |
|
||||
| LXCs on 10 (3 running) | 0 | 3 | 3 |
|
||||
|
||||
**Top priorities**
|
||||
|
||||
1. Harden **SSH on both Proxmox hosts** (root + passwords currently allowed).
|
||||
2. Restrict **Proxmox API/UI port 8006** to admin IPs.
|
||||
3. Disable **password SSH on all LXCs**; deploy keys + `make copy-ssh-keys` for inventory IPs.
|
||||
4. Patch hosts with **40–105** pending apt upgrades (hypervisors worst).
|
||||
5. Put **HTTP services** (8080, 8000, qBit, etc.) behind reverse proxy + TLS or bind to internal IPs.
|
||||
|
||||
---
|
||||
|
||||
## Proxmox hypervisors
|
||||
|
||||
### pve201 — 10.0.10.201 (`pve`)
|
||||
|
||||
| Resource | Status |
|
||||
|----------|--------|
|
||||
| OS | Debian 12, PVE 8.4.16, kernel 6.8.12-18-pve |
|
||||
| RAM free | ~2.5 GB / 126 GB (**critical**) |
|
||||
| Pending apt | **105** |
|
||||
| UFW / fail2ban / unattended-upgrades | **None** |
|
||||
|
||||
#### SSH audit (dedicated)
|
||||
|
||||
| Setting | Current | Target |
|
||||
|---------|---------|--------|
|
||||
| `permitrootlogin` | **yes** | `prohibit-password` |
|
||||
| `passwordauthentication` | **yes** | `no` |
|
||||
| `pubkeyauthentication` | yes | yes |
|
||||
| `maxauthtries` | 6 | 3–4 |
|
||||
| `x11forwarding` | yes | no (on servers) |
|
||||
| Root keys | 3 keys in `authorized_keys` | audit/remove unused |
|
||||
|
||||
#### Exposed services
|
||||
|
||||
| Port | Service | Risk |
|
||||
|------|---------|------|
|
||||
| 22 | SSH | Brute-force (no fail2ban) |
|
||||
| 8006 | Proxmox API/UI | **Critical** — full cluster control |
|
||||
| 3128 | spiceproxy | Medium |
|
||||
| 111 | rpcbind | Low — reduce exposure |
|
||||
|
||||
#### Fixes (pve201)
|
||||
|
||||
```bash
|
||||
# 1) SSH — prefer Ansible after limiting to your IP
|
||||
make copy-ssh-key HOST=pve201 # if needed
|
||||
# Manual quick fix on host:
|
||||
sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
|
||||
sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
sshd -t && systemctl reload sshd
|
||||
|
||||
# 2) Proxmox firewall — Datacenter → Firewall → restrict 8006 to 10.0.10.0/24 or admin IP
|
||||
# Or iptables on host for port 8006
|
||||
|
||||
# 3) fail2ban
|
||||
apt install fail2ban -y
|
||||
systemctl enable --now fail2ban
|
||||
|
||||
# 4) Auto security updates
|
||||
apt install unattended-upgrades apt-listchanges -y
|
||||
dpkg-reconfigure -plow unattended-upgrades
|
||||
|
||||
# 5) Patch
|
||||
apt update && apt upgrade -y
|
||||
```
|
||||
|
||||
**Ansible (when ready):** add `pve201` / `pve10` to a `proxmox` group play with `roles/ssh` + `roles/monitoring_server` (fail2ban). Do **not** lock yourself out — test with second session first.
|
||||
|
||||
---
|
||||
|
||||
### pve10 — 10.0.10.10 (`PVENAS`)
|
||||
|
||||
| Resource | Status |
|
||||
|----------|--------|
|
||||
| OS | Debian 13 (trixie), PVE, kernel 6.17.13-3-pve |
|
||||
| Load | **~30** on 24 CPUs (overloaded) |
|
||||
| Pending apt | **92** |
|
||||
| UFW / fail2ban / unattended-upgrades | **None** |
|
||||
| ZFS `NAS.SP00` | **inactive** (I/O suspended) |
|
||||
| PBS `PVEBUVD00` → 10.0.10.200:8007 | **unreachable** |
|
||||
|
||||
#### SSH audit (dedicated)
|
||||
|
||||
Same as pve201: `permitrootlogin yes`, `passwordauthentication yes`, 3 root authorized_keys.
|
||||
|
||||
#### Exposed services
|
||||
|
||||
| Port | Service | Risk |
|
||||
|------|---------|------|
|
||||
| 22 | SSH | High |
|
||||
| 8006 | Proxmox API/UI | **Critical** |
|
||||
| 2049, mountd, statd | NFS/RPC | High on LAN |
|
||||
| 3128 | spiceproxy | Medium |
|
||||
|
||||
#### Fixes (pve10)
|
||||
|
||||
Same SSH / fail2ban / unattended-upgrades / patch steps as pve201.
|
||||
|
||||
Additional:
|
||||
|
||||
```bash
|
||||
# Investigate ZFS pool
|
||||
zpool status NAS.SP00
|
||||
# Fix PBS connectivity or remove stale datastore from Proxmox UI
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## LXCs on pve201 (via `pct exec`)
|
||||
|
||||
| VMID | Name | IP | Status | SSH root | Password auth | UFW | fail2ban | Upgrades | Public services |
|
||||
|------|------|-----|--------|----------|---------------|-----|----------|----------|-----------------|
|
||||
| 301 | vikunja-debian | 10.0.10.159 | running | without-password | **yes** | no | no | 0 | **3456**, 22 |
|
||||
| 302 | qbit-debian | 10.0.10.91 | running | without-password | **yes** | no | no | 0 | **8080** (qBit), 22 |
|
||||
| 303 | searchXNG-debian | 10.0.10.70 | running | without-password | **yes** | no | no | **83** | **8080**, 22 |
|
||||
| 304 | wireguard-debian | 10.0.10.192 | running | without-password | **yes** | no | no | 0 | 22 |
|
||||
| 305 | kuma-debian | 10.0.10.197 | **stopped** | — | — | — | — | — | replaced by LXC 218 |
|
||||
| 306 | portfolio | — | **destroyed** | — | — | — | — | — | migrated → pve10 LXC **219** @ `10.0.10.106` (purged 2026-05-22) |
|
||||
| 307 | jobber-delian | 10.0.10.178 | running | without-password | **yes** | no | no | **83** | **3005**, 22 |
|
||||
| 308 | stirling-pdf | 10.0.10.43 | running | without-password | **yes** | no | no | 0 | **8080**, 22 |
|
||||
| 9001 | pote-dev | 10.0.10.114 | **stopped** | — | — | — | — | — | — |
|
||||
| 9101 | punimTagFE-dev | 10.0.10.121 | running | without-password | **yes** | **active** | no | **89** | **8000**, 111, 22 |
|
||||
| 9401 | mirrormatch-dev | 10.0.10.141 | **stopped** | — | — | — | — | — | — |
|
||||
|
||||
**Inventory mapping:** `vikanjans` → 159, `qBittorrent` → 91, `punimTag` app → 121.
|
||||
|
||||
### Common LXC issues (pve201)
|
||||
|
||||
| Issue | Severity | Fix |
|
||||
|-------|----------|-----|
|
||||
| `passwordauthentication yes` on all LXCs | High | Set `PasswordAuthentication no` in `/etc/ssh/sshd_config`, reload sshd |
|
||||
| No fail2ban | High | Install fail2ban or rely on Proxmox FW + LAN segmentation |
|
||||
| Apps on `0.0.0.0:8080` / 8000 / 3456 | High | Bind to localhost + Caddy, or restrict via Proxmox guest firewall (`firewall=1` on net0 — enable rules) |
|
||||
| 79–89 pending upgrades on several CTs | Medium | `pct exec <id> -- bash -c 'apt update && apt upgrade -y'` (without `bash -c`, the `&& apt upgrade` part runs on the Proxmox host) |
|
||||
| Stopped dev CTs (9001, 9401) | Low | Start when needed or keep stopped to reduce attack surface |
|
||||
|
||||
### Per-LXC fixes (pve201)
|
||||
|
||||
```bash
|
||||
# Example: harden + patch vikunja (301) from Proxmox host
|
||||
pct exec 301 -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
pct exec 301 -- systemctl reload ssh
|
||||
|
||||
# Patch container
|
||||
pct exec 303 -- bash -c 'apt update && apt upgrade -y'
|
||||
|
||||
# Copy your SSH key (from Mac, once password/key works)
|
||||
make copy-ssh-key HOST=vikanjans # 10.0.10.159
|
||||
make copy-ssh-key HOST=qBittorrent # 10.0.10.91
|
||||
```
|
||||
|
||||
**punimTagFE-dev (9101):** Only LXC with **UFW active** — extend rules to deny inbound except 22 from admin subnet; still disable password auth.
|
||||
|
||||
---
|
||||
|
||||
## LXCs on pve10 (via `pct exec`)
|
||||
|
||||
| VMID | Name | IP | Status | SSH root | Password auth | UFW | fail2ban | Upgrades | Public services |
|
||||
|------|------|-----|--------|----------|---------------|-----|----------|----------|-----------------|
|
||||
| 210 | cal | 10.0.10.228 | running | without-password | **yes** | no | no | 0 | **3000**, 22 |
|
||||
| 215 | caseware | 10.0.10.105 | running | without-password | **yes** | no | no | **40** | **80** (nginx), 22 |
|
||||
| 216 | auto | 10.0.10.59 | running | without-password | **yes** | no | no | **40** | **80** (nginx), 22 |
|
||||
|
||||
**Inventory mapping:** `caseware` → 105, `auto` → 59.
|
||||
|
||||
### Fixes (pve10 LXCs)
|
||||
|
||||
```bash
|
||||
# SSH harden caseware (215)
|
||||
pct exec 215 -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
pct exec 215 -- systemctl reload sshd
|
||||
|
||||
# Patch
|
||||
pct exec 215 -- bash -c 'apt update && apt upgrade -y'
|
||||
pct exec 216 -- bash -c 'apt update && apt upgrade -y'
|
||||
|
||||
# Deploy keys from Mac
|
||||
make copy-ssh-key HOST=caseware
|
||||
make copy-ssh-key HOST=auto
|
||||
```
|
||||
|
||||
**HTTP port 80 on caseware/auto:** Ensure TLS termination on Caddy (inventory host `caddy` 10.0.10.50) and no plain HTTP from WAN if exposed.
|
||||
|
||||
---
|
||||
|
||||
## SSH hardening checklist (all Linux targets)
|
||||
|
||||
Use this order to avoid lockout:
|
||||
|
||||
1. Confirm your key works: `ssh -o BatchMode=yes root@<ip> true`
|
||||
2. Set `PasswordAuthentication no`
|
||||
3. Set `PermitRootLogin prohibit-password` (LXCs already `without-password` — equivalent for keys-only)
|
||||
4. `sshd -t && systemctl reload sshd`
|
||||
5. Open **second terminal** and test before closing first
|
||||
6. Optional: change SSH port, `MaxAuthTries 4`, disable `X11Forwarding`
|
||||
|
||||
**Ansible alignment:**
|
||||
|
||||
```bash
|
||||
# After keys on host
|
||||
make dev HOST=<hostname> --tags security
|
||||
# or role ssh via playbooks that include roles/ssh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Re-run audits
|
||||
|
||||
```bash
|
||||
# Hypervisor full audit
|
||||
ssh root@10.0.10.201 'bash -s' < scripts/security-audit-remote.sh
|
||||
ssh root@10.0.10.10 'bash -s' < scripts/security-audit-remote.sh
|
||||
|
||||
# Hypervisor SSH-only
|
||||
ssh root@10.0.10.201 'bash -s' < scripts/security-audit-ssh.sh
|
||||
|
||||
# All LXCs on a node
|
||||
ssh root@10.0.10.201 'bash -s' < scripts/security-audit-lxc-via-pve.sh
|
||||
ssh root@10.0.10.10 'bash -s' < scripts/security-audit-lxc-via-pve.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Tracking
|
||||
|
||||
| Item | Owner | Status |
|
||||
|------|-------|--------|
|
||||
| SSH harden pve201 | | ☐ |
|
||||
| SSH harden pve10 | | ☐ |
|
||||
| Restrict 8006 on both nodes | | ☐ |
|
||||
| fail2ban on hypervisors | | ☐ |
|
||||
| Patch pve201 / pve10 | | ☐ |
|
||||
| Disable password SSH on all LXCs | | ☐ |
|
||||
| `copy-ssh-keys` for inventory | | ☐ |
|
||||
| TLS for :80/:8080 services | | ☐ |
|
||||
| Fix ZFS NAS.SP00 on pve10 | | ☐ |
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- **[Security remediation plan](security-remediation-plan.md)** — phased fixes (critical → low) and login model
|
||||
- [Security hardening guide](security.md)
|
||||
- [SECURITY_HARDENING_PLAN.md](../SECURITY_HARDENING_PLAN.md)
|
||||
- Role defaults: `roles/ssh/defaults/main.yml`
|
||||
459
docs/guides/security-remediation-plan.md
Normal file
459
docs/guides/security-remediation-plan.md
Normal file
@ -0,0 +1,459 @@
|
||||
# Security Remediation Plan
|
||||
|
||||
**Based on:** [security-audit-report.md](security-audit-report.md) (2026-05-20)
|
||||
**Goal:** Align hosts with `roles/ssh` (keys only, no password SSH) without locking yourself out.
|
||||
|
||||
---
|
||||
|
||||
## How you should log in (not “ladmin → root” everywhere)
|
||||
|
||||
Your inventory uses **different users on purpose**. After hardening, the pattern is:
|
||||
|
||||
| Host type | Inventory user | How you work | Root access |
|
||||
|-----------|----------------|--------------|-------------|
|
||||
| **Proxmox** (`pve201`, `pve10`) | `root` | `ssh root@10.0.10.201` with **your SSH key** | Direct root (keys only, no password) |
|
||||
| **Dev / QA** (`dev01`, `git-ci-01`, …) | `ladmin` (or `beast`, `master`) | `ssh ladmin@host` with **key** | `sudo` for admin tasks; Ansible `become: true` |
|
||||
| **Services** (caddy, jellyfin, …) | often `root` | `ssh root@host` with **key** | Direct root (keys only) |
|
||||
| **Optional bootstrap** | — | `make bootstrap-root-ssh HOST=x` | One-time: key on `ladmin` → `su` to install **root** key → then harden SSH |
|
||||
|
||||
**You do not need** “SSH ladmin then su root” on Proxmox if you keep managing them as `root` in inventory — you need **root + SSH key + passwords disabled**.
|
||||
|
||||
**You do** use ladmin → sudo on dev/qa boxes where `ansible_user=ladmin`. That is normal: unprivileged (or sudo) login + elevation, not password guessing on root.
|
||||
|
||||
**`PermitRootLogin prohibit-password`** means: root may log in **only with a key**, never with a password. It does **not** mean “ban root; use ladmin only.”
|
||||
|
||||
**`PasswordAuthentication no`** means: **nobody** (root, ladmin, etc.) can SSH with a password — keys only.
|
||||
|
||||
---
|
||||
|
||||
## Phases overview
|
||||
|
||||
| Phase | When | Focus |
|
||||
|-------|------|--------|
|
||||
| **0 — Backup + prep** | Before any change | Snapshots, `sshd` copies, git commit, keys, second SSH session |
|
||||
| **1 — Critical** | Week 1 | Proxmox SSH + 8006, keys everywhere, RAM on 201 |
|
||||
| **2 — High** | Week 1–2 | LXCs SSH, fail2ban, patching, app ports |
|
||||
| **3 — Medium** | Week 2–4 | unattended-upgrades, Ansible `make security`, TLS |
|
||||
| **4 — Low** | Ongoing | rpcbind, naming, stopped CTs, Mac, docs |
|
||||
|
||||
---
|
||||
|
||||
## Phase 0 — Backup (before any hardening)
|
||||
|
||||
**Yes — back up first.** SSH and firewall mistakes can lock you out; patches can break services. Use the right backup type per layer.
|
||||
|
||||
### What to back up (by layer)
|
||||
|
||||
| Layer | What | Method | Rollback if SSH breaks |
|
||||
|-------|------|--------|-------------------------|
|
||||
| **Your Mac** | Ansible repo + `~/.ansible-vault-pass` (secure copy) + SSH keys | Time Machine / git commit / copy `~/.ssh` | N/A |
|
||||
| **Proxmox hosts** | `/etc/ssh/sshd_config`, `/etc/pve/`, firewall rules | Copy files + **Proxmox snapshot** optional | **Console** in web UI (`pct enter` / VM console) |
|
||||
| **Each LXC/VM** | Full guest state | **Proxmox snapshot** or `vzdump` | Restore snapshot or rollback CT |
|
||||
| **Dev workstations** | OS + home (if Timeshift installed) | `make timeshift-snapshot HOST=dev02` | `make timeshift-restore` |
|
||||
| **Central PBS** | — | **Not reliable today** — `10.0.10.200` unreachable | Fix PBS later; don’t depend on it for this work |
|
||||
|
||||
### 0A — Mac / repo (5 minutes)
|
||||
|
||||
```bash
|
||||
cd ~/Documents/code/ansible
|
||||
git status
|
||||
git add -A && git commit -m "Pre-security-hardening baseline" # if you want a restore point
|
||||
|
||||
# Store vault passphrase somewhere safe (password manager), NOT only on disk
|
||||
# Optional: encrypted copy of ~/.ansible-vault-pass offline
|
||||
```
|
||||
|
||||
### 0B — Proxmox: config files (both nodes)
|
||||
|
||||
```bash
|
||||
for pve in 10.0.10.201 10.0.10.10; do
|
||||
ssh root@$pve "mkdir -p /root/pre-hardening-$(date +%Y%m%d) && \
|
||||
cp -a /etc/ssh/sshd_config /root/pre-hardening-$(date +%Y%m%d)/ && \
|
||||
cp -a /etc/pve /root/pre-hardening-$(date +%Y%m%d)/pve-etc 2>/dev/null; \
|
||||
ls -la /root/pre-hardening-$(date +%Y%m%d)/"
|
||||
done
|
||||
```
|
||||
|
||||
### 0C — Proxmox: snapshots (recommended before SSH/firewall on PVE)
|
||||
|
||||
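**LXCs on pve201** (from audit): 301–308, 9101 — snapshot each before `pct exec` SSH changes. Note that 305 is stopped (replaced by LXC 218) and 306 was destroyed after migrating to pve10 LXC 219, so expect `FAILED 306` from the loop below.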
|
||||
|
||||
**Running LXCs on pve10:** 210, 215, 216.
|
||||
|
||||
```bash
|
||||
# On pve201 — snapshot (fast, local-lvm; needs free space)
|
||||
ssh root@10.0.10.201 'for id in 301 302 303 304 305 306 307 308 9101; do
|
||||
name=$(pct list | awk -v i=$id "\$1==i {print \$NF}")
|
||||
echo "Snapshot vmid=$id ($name)"
|
||||
pct snapshot $id pre-ssh-hardening-$(date +%Y%m%d) || echo "FAILED $id"
|
||||
done'
|
||||
|
||||
# On pve10
|
||||
ssh root@10.0.10.10 'for id in 210 215 216; do
|
||||
pct snapshot $id pre-ssh-hardening-$(date +%Y%m%d) || echo "FAILED $id"
|
||||
done'
|
||||
```
|
||||
|
||||
**Optional full backup** (slower, larger) — important CTs only if snapshots fail (low disk on 201):
|
||||
|
||||
```bash
|
||||
vzdump <vmid> --storage local --mode snapshot --compress zstd
|
||||
```
|
||||
|
||||
**Check space on pve201 first** (~2.5 GB RAM + disk — snapshot needs free space on `local-lvm`):
|
||||
|
||||
```bash
|
||||
ssh root@10.0.10.201 'pvesm status; free -h'
|
||||
```
|
||||
|
||||
If snapshots fail for lack of space: do **0B only** on PVE, then harden SSH using **Proxmox console** as safety net (no snapshot).
|
||||
|
||||
### 0D — Inventory VMs with Timeshift (`dev` group)
|
||||
|
||||
Only where Timeshift is already installed (e.g. `dev02`):
|
||||
|
||||
```bash
|
||||
make timeshift-snapshot HOST=dev02
|
||||
make timeshift-list HOST=dev02
|
||||
```
|
||||
|
||||
Not used on Proxmox or most LXCs by default.
|
||||
|
||||
### 0E — Export current SSH settings (audit trail)
|
||||
|
||||
```bash
|
||||
mkdir -p ~/security-hardening-backup-$(date +%Y%m%d)
|
||||
ssh root@10.0.10.201 'bash -s' < scripts/security-audit-ssh.sh > ~/security-hardening-backup-$(date +%Y%m%d)/pve201-ssh.txt
|
||||
ssh root@10.0.10.10 'bash -s' < scripts/security-audit-ssh.sh > ~/security-hardening-backup-$(date +%Y%m%d)/pve10-ssh.txt
|
||||
ssh root@10.0.10.201 'bash -s' < scripts/security-audit-lxc-via-pve.sh > ~/security-hardening-backup-$(date +%Y%m%d)/pve201-lxc.txt
|
||||
```
|
||||
|
||||
### Backup exit criteria (do not skip)
|
||||
|
||||
- [ ] Git commit (or branch) for ansible repo
|
||||
- [ ] `sshd_config` (+ optional `/etc/pve`) copied on **both** PVE nodes
|
||||
- [ ] Proxmox snapshots **or** documented reason skipped (disk/RAM)
|
||||
- [ ] Second SSH session tested to `pve201` / `pve10`
|
||||
- [ ] You know how to open **Proxmox → VM/CT → Console** if SSH fails
|
||||
|
||||
### Rollback quick reference
|
||||
|
||||
| Problem | Rollback |
|
||||
|---------|----------|
|
||||
| Bad `sshd_config` on PVE | Console → restore `/root/pre-hardening-*/sshd_config` → `systemctl reload sshd` |
|
||||
| Bad LXC SSH | `pct rollback <vmid> pre-ssh-hardening-YYYYMMDD` |
|
||||
| Bad patch on CT | Same snapshot rollback |
|
||||
| Locked out of LAN on 8006 | Console → disable the offending datacenter firewall rule (or `pve-firewall stop` temporarily) |
|
||||
|
||||
---
|
||||
|
||||
## Phase 0 — Prep (after backups)
|
||||
|
||||
| # | Task | Command / notes |
|
||||
|---|------|----------------|
|
||||
| 0.1 | Confirm vault password file | `~/.ansible-vault-pass` |
|
||||
| 0.2 | Bootstrap control node | `make bootstrap` |
|
||||
| 0.3 | Verify key on Proxmox | `ssh -o BatchMode=yes root@10.0.10.201 true` |
|
||||
| 0.4 | Copy keys to inventory | `make copy-ssh-keys` (or per group) |
|
||||
| 0.5 | Document admin IP | e.g. `10.0.10.127` for firewall rules |
|
||||
| 0.6 | Open **second terminal** before changing `sshd` | Test login before closing first session |
|
||||
|
||||
**Exit criteria:** Backups done (above) + key login works to `pve201`, `pve10`, and hosts you will harden next.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1 — Critical
|
||||
|
||||
### 1.1 Proxmox SSH (pve201 + pve10)
|
||||
|
||||
**Issue:** `PermitRootLogin yes` + `PasswordAuthentication yes` — password brute force on root.
|
||||
|
||||
**Fix (per host, after 0.3):**
|
||||
|
||||
```bash
|
||||
# On pve201 OR pve10 — keep existing session open!
|
||||
sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
|
||||
sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
sshd -t && systemctl reload sshd
|
||||
```
|
||||
|
||||
**Verify (new terminal):** `ssh -o BatchMode=yes root@10.0.10.201 true`
|
||||
|
||||
**Ansible (later):** dedicated play for `[proxmox]` with `roles/ssh` (today `make security` only targets `dev` playbook).
|
||||
|
||||
| Host | Priority |
|
||||
|------|----------|
|
||||
| pve201 | P0 |
|
||||
| pve10 | P0 |
|
||||
|
||||
---
|
||||
|
||||
### 1.2 Restrict Proxmox UI/API (port 8006)
|
||||
|
||||
**Issue:** Anyone on LAN can hit full cluster API.
|
||||
|
||||
**Fix (choose one):**
|
||||
|
||||
- **A — Proxmox firewall (recommended):** Datacenter → Firewall → add rule: accept `8006` from `10.0.10.0/24` and/or your Mac IP; drop others.
|
||||
- **B — SSH tunnel only:** no LAN exposure; `ssh -L 8006:127.0.0.1:8006 root@10.0.10.201` → browser `https://127.0.0.1:8006`.
|
||||
|
||||
**Do not** block 8006 globally without A or B in place.
|
||||
|
||||
---
|
||||
|
||||
### 1.3 RAM on pve201 (~2.5 GB free)
|
||||
|
||||
**Issue:** New guests or updates risk OOM.
|
||||
|
||||
**Fix:**
|
||||
|
||||
```bash
|
||||
ssh root@10.0.10.201 'free -h; pct list'
|
||||
# Stop non-essential CTs/VMs or migrate workload to pve10
|
||||
```
|
||||
|
||||
Review running guests from `make proxmox-info ALL=true`; stop labs you do not need.
|
||||
|
||||
---
|
||||
|
||||
### 1.4 Deploy SSH keys to unreachable inventory hosts
|
||||
|
||||
**Issue:** Cannot audit or Ansible-manage hosts without keys.
|
||||
|
||||
**Order:**
|
||||
|
||||
1. `make copy-ssh-key HOST=caddy` (and each `[services]` host)
|
||||
2. `make bootstrap-root-ssh HOST=listmonk` where root password still works but key does not
|
||||
3. `make copy-ssh-keys GROUP=qa` for `ladmin` hosts
|
||||
|
||||
**Exit criteria:** `make ping` succeeds for each group you will harden in phase 2.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2 — High
|
||||
|
||||
### 2.1 LXC SSH — disable password auth (all running CTs)
|
||||
|
||||
**Issue:** `passwordauthentication yes` on every audited LXC.
|
||||
|
||||
**Fix from Proxmox host (no Mac SSH to CT required):**
|
||||
|
||||
```bash
|
||||
# pve201 — example for each running VMID
|
||||
for id in 301 302 303 304 307 308 9101; do  # 305 is stopped, 306 was destroyed
|
||||
pct exec $id -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
pct exec $id -- bash -c 'sshd -t && (systemctl reload sshd || systemctl reload ssh)'
|
||||
done
|
||||
|
||||
# pve10
|
||||
for id in 210 215 216; do
|
||||
pct exec $id -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
pct exec $id -- systemctl reload sshd
|
||||
done
|
||||
```
|
||||
|
||||
**Before disable:** install your key on CTs you need (`make copy-ssh-key HOST=vikanjans`, etc.).
|
||||
|
||||
**Note:** CTs already have `permitrootlogin without-password` — keep that; only turn off passwords.
|
||||
|
||||
---
|
||||
|
||||
### 2.2 fail2ban on hypervisors
|
||||
|
||||
**Issue:** No brute-force protection on SSH (and eventually 8006 if proxied).
|
||||
|
||||
```bash
|
||||
ssh root@10.0.10.201 'apt install -y fail2ban && systemctl enable --now fail2ban'
|
||||
ssh root@10.0.10.10 'apt install -y fail2ban && systemctl enable --now fail2ban'
|
||||
```
|
||||
|
||||
Optional: extend to high-value LXCs via `roles/monitoring_server` or manual install.
|
||||
|
||||
---
|
||||
|
||||
### 2.3 Patch backlog
|
||||
|
||||
| Target | Pending | Action |
|
||||
|--------|---------|--------|
|
||||
| pve201 | ~105 | `apt update && apt upgrade -y` (maintenance window) |
|
||||
| pve10 | ~92 | same |
|
||||
| LXCs 303, 306, 307, 9101 | 79–89 | `pct exec <id> -- bash -c 'apt update && apt upgrade -y'` |
|
||||
| caseware, auto (pve10) | ~40 | same |
|
||||
|
||||
**Order:** hypervisors first (after snapshot), then LXCs one by one.
|
||||
|
||||
---
|
||||
|
||||
### 2.4 Application ports on `0.0.0.0`
|
||||
|
||||
**Issue:** HTTP services exposed on LAN without TLS/auth.
|
||||
|
||||
| LXC / host | Port | Fix |
|
||||
|------------|------|-----|
|
||||
| qbit (91) | 8080 | Prefer VPN; or Caddy + auth; bind to internal IP |
|
||||
| searchXNG (70) | 8080 | Same |
|
||||
| punimTagFE (121) | 8000 | Behind Caddy; firewall allow only 10.0.10.0/24 |
|
||||
| vaultwarden (142) | 8080 | Already in inventory — reverse proxy + TLS |
|
||||
| portfolio | **106:80** (pve10 LXC 219, nginx) | Migrated 2026-05-22; pve201 LXC **306 destroyed** |
|
||||
| vikunja (159) | 3456 | Proxy via Caddy (`todo.levkin.ca`) |
|
||||
|
||||
**Pattern:** App listens `127.0.0.1` only; **Caddy** (`10.0.10.50`) terminates TLS for public URLs in inventory.
|
||||
|
||||
---
|
||||
|
||||
### 2.5 pve10 infrastructure
|
||||
|
||||
| Issue | Fix |
|
||||
|-------|-----|
|
||||
| ZFS `NAS.SP00` suspended | `zpool status`; import/clear errors |
|
||||
| PBS 10.0.10.200 unreachable | Fix network/service or remove stale datastore |
|
||||
| Load ~30 | Identify heavy VMs; migrate or stop |
|
||||
|
||||
---
|
||||
|
||||
## Phase 3 — Medium
|
||||
|
||||
### 3.1 unattended-upgrades
|
||||
|
||||
Hypervisors + important LXCs:
|
||||
|
||||
```bash
|
||||
apt install -y unattended-upgrades apt-listchanges
|
||||
dpkg-reconfigure -plow unattended-upgrades
|
||||
```
|
||||
|
||||
### 3.2 Ansible security roles (by group)
|
||||
|
||||
Today `make security` runs `playbooks/development.yml` on **`dev` only**.
|
||||
|
||||
**Expand with new/changed playbooks:**
|
||||
|
||||
| Group | Playbook idea | Roles |
|
||||
|-------|---------------|-------|
|
||||
| `[proxmox]` | `playbooks/infrastructure/proxmox-hardening.yml` | `ssh`, monitoring_server |
|
||||
| `[services]` | extend `playbooks/servers.yml` | `ssh`, `base`, fail2ban |
|
||||
| `[qa]` | tag run on qa hosts | `ssh` |
|
||||
| LXCs | optional `pct` + Ansible over SSH after keys | `ssh` |
|
||||
|
||||
**Workflow:**
|
||||
|
||||
```bash
|
||||
make check HOST=pve201 # after proxmox play exists
|
||||
make dev HOST=dev01 --tags security
|
||||
```
|
||||
|
||||
### 3.3 UFW on LXCs
|
||||
|
||||
Only **punimTagFE-dev** has UFW today. Template for others:
|
||||
|
||||
- Allow 22 from `10.0.10.0/24`
|
||||
- Allow app port only if needed on LAN
|
||||
- Default deny incoming
|
||||
|
||||
Use `roles/ssh` UFW tasks or Proxmox guest firewall (`firewall=1` on `net0`).
|
||||
|
||||
### 3.4 Align names / inventory
|
||||
|
||||
| Proxmox name | Ansible | Action |
|
||||
|--------------|---------|--------|
|
||||
| punimTagFE-dev | punimTag-dev | Rename CT or update `app_projects` name |
|
||||
| vikunja-debian | vikanjans | OK (IP 159) |
|
||||
| qbit-debian | qBittorrent | OK (IP 91) |
|
||||
|
||||
### 3.5 Mac (control machine)
|
||||
|
||||
| Issue | Fix |
|
||||
|-------|-----|
|
||||
| Firewall off | System Settings → Firewall → On |
|
||||
| FileVault off | Enable FileVault |
|
||||
| Docker on `*:3000` | Bind to `127.0.0.1` unless LAN needed |
|
||||
|
||||
---
|
||||
|
||||
## Phase 4 — Low
|
||||
|
||||
| Item | Fix |
|
||||
|------|-----|
|
||||
| rpcbind (111) on pve201 / 9101 | Disable if unused: `systemctl disable --now rpcbind.service rpcbind.socket` |
|
||||
| X11Forwarding on Proxmox | Set `no` in sshd |
|
||||
| Stopped CTs 9001, 9401 | Leave stopped or destroy if unused |
|
||||
| `make security-audit` target | Add Makefile → runs audit scripts, appends to report |
|
||||
| Quarterly re-audit | Re-run `scripts/security-audit-lxc-via-pve.sh` |
|
||||
|
||||
---
|
||||
|
||||
## Suggested calendar
|
||||
|
||||
| Week | Critical | High | Medium |
|
||||
|------|----------|------|--------|
|
||||
| **1** | 0.x prep, 1.1 SSH both PVE, 1.2 firewall 8006, 1.4 keys | 2.1 LXC passwords off (after keys), 2.2 fail2ban | — |
|
||||
| **2** | 1.3 RAM 201 | 2.3 patch PVE + LXCs, 2.4 Caddy for 8080 services | 3.1 unattended-upgrades |
|
||||
| **3** | — | 2.5 pve10 ZFS/PBS/load | 3.2 Ansible plays for proxmox + services |
|
||||
| **4** | — | — | 3.3 UFW, 3.4 naming, 3.5 Mac |
|
||||
|
||||
---
|
||||
|
||||
## Rollback (if locked out of SSH)
|
||||
|
||||
- Proxmox: use **console** in web UI (or physical/IPMI) → edit `/etc/ssh/sshd_config` → `PasswordAuthentication yes` temporarily → reload sshd.
|
||||
- LXC: `pct enter <vmid>` from PVE host.
|
||||
|
||||
---
|
||||
|
||||
## Tracking checklist
|
||||
|
||||
Copy into your issue tracker or tick in [security-audit-report.md](security-audit-report.md):
|
||||
|
||||
**Backup (Phase 0 — before everything)**
|
||||
|
||||
- [ ] Git commit / branch for ansible repo
|
||||
- [ ] PVE `sshd_config` backup on 201 + 10
|
||||
- [ ] Proxmox CT snapshots (or vzdump) on critical LXCs
|
||||
- [ ] Audit outputs saved locally (`security-hardening-backup-*`)
|
||||
- [ ] Console access tested in Proxmox UI
|
||||
|
||||
**Critical**
|
||||
|
||||
- [ ] pve201 SSH: prohibit-password + no passwords
|
||||
- [ ] pve10 SSH: same
|
||||
- [ ] 8006 restricted to admin subnet/IP
|
||||
- [ ] SSH keys on all inventory hosts
|
||||
- [ ] pve201 RAM relieved
|
||||
|
||||
**High**
|
||||
|
||||
- [ ] All running LXCs: PasswordAuthentication no
|
||||
- [ ] fail2ban on pve201 + pve10
|
||||
- [ ] Patch pve201, pve10, LXCs with 40+ upgrades
|
||||
- [ ] qBit / searchXNG / punimTag / vaultwarden port exposure reduced
|
||||
- [ ] pve10 ZFS + PBS investigated
|
||||
|
||||
**Medium**
|
||||
|
||||
- [ ] unattended-upgrades on PVE + key LXCs
|
||||
- [ ] `make security` (or new plays) for proxmox, services, qa
|
||||
- [ ] UFW on critical LXCs
|
||||
- [ ] Mac firewall + FileVault
|
||||
|
||||
**Low**
|
||||
|
||||
- [ ] rpcbind, X11, audit Makefile, naming cleanup
|
||||
|
||||
---
|
||||
|
||||
## Quick reference: your login after plan
|
||||
|
||||
```bash
|
||||
# Proxmox
|
||||
ssh root@10.0.10.201 # key only
|
||||
|
||||
# Dev / QA
|
||||
ssh ladmin@10.0.10.223 # key only → sudo -i when you need root
|
||||
|
||||
# Services (inventory root)
|
||||
ssh root@10.0.10.50 # key only
|
||||
|
||||
# Proxmox UI (if 8006 restricted)
|
||||
ssh -L 8006:127.0.0.1:8006 root@10.0.10.201
|
||||
# → https://127.0.0.1:8006
|
||||
```
|
||||
62
docs/guides/site-lxc-git.md
Normal file
62
docs/guides/site-lxc-git.md
Normal file
@ -0,0 +1,62 @@
|
||||
# Site LXCs — git deploy (caseware / auto / portfolio)
|
||||
|
||||
## Remotes (correct)
|
||||
|
||||
Use **`git.levkin.ca`**, not `10.0.30.169`:
|
||||
|
||||
```
|
||||
git@git.levkin.ca:ilia/caseware.git
|
||||
git@git.levkin.ca:ilia/auto.git
|
||||
git@git.levkin.ca:ilia/sdetProfile.git
|
||||
```
|
||||
|
||||
Gitea VM is **`10.0.10.169`** on pve10. Public `git.levkin.ca:22` hits your home IP and is **closed**; git SSH uses LAN IP via `~/.ssh/config`.
|
||||
|
||||
## SSH config (on site LXC, as root)
|
||||
|
||||
```ssh
|
||||
# /root/.ssh/config
|
||||
Host git.levkin.ca
|
||||
HostName 10.0.10.169
|
||||
User git
|
||||
IdentityFile ~/.ssh/id_ed25519
|
||||
StrictHostKeyChecking accept-new
|
||||
```
|
||||
|
||||
## Deploy keys
|
||||
|
||||
Each LXC should use its **own** deploy key in Gitea (**Repo → Settings → Deploy Keys**). Gitea allows a public key only **once per server** — if you see *“already been added to the server”*, generate a repo-specific key:
|
||||
|
||||
```bash
|
||||
# On portfolio LXC 219 (via pve10)
|
||||
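# Generate the repo-specific key (skip if it already exists), then print the public half for Gitea
pct exec 219 -- ssh-keygen -t ed25519 -N '' -C deploy-portfolio-sdetProfile -f /root/.ssh/id_ed25519_gitea
pct exec 219 -- cat /root/.ssh/id_ed25519_gitea.pub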
|
||||
```
|
||||
|
||||
Portfolio uses `~/.ssh/id_ed25519_gitea` in `/root/.ssh/config` for `Host git.levkin.ca` (`IdentitiesOnly yes`).
|
||||
|
||||
| LXC | Repo | Key file / comment |
|
||||
|-----|------|---------------------|
|
||||
| 215 | caseware | `~/.ssh/id_ed25519` → `root@caseware` |
|
||||
| 216 | auto | `~/.ssh/id_ed25519` → `root@auto` |
|
||||
| 219 | sdetProfile | `~/.ssh/id_ed25519_gitea` → `deploy-portfolio-sdetProfile` |
|
||||
|
||||
## Push / pull
|
||||
|
||||
```bash
|
||||
# On LXC (via pve10)
|
||||
pct exec 215 -- bash -c 'cd /var/www/caseware && git pull origin main && git push origin main'
|
||||
pct exec 216 -- bash -c 'cd /var/www/auto && git pull origin master && git push origin master'
|
||||
pct exec 219 -- bash -c 'cd /var/www/portfolio && git pull origin master && git push origin master'
|
||||
```
|
||||
|
||||
After editing `index.html`, commit on the LXC, push, then hard-refresh the public site.
|
||||
|
||||
## Gitea VM SSH (git@10.0.10.169)
|
||||
|
||||
If deploy keys fail after adding them in the UI:
|
||||
|
||||
1. Keys live in `/var/lib/gitea/.ssh/authorized_keys` (regenerated by Gitea).
|
||||
2. OpenSSH logs in as user **`git`** — copy/sync that file to **`/home/git/.ssh/authorized_keys`** (`chown git:git`, mode `600`).
|
||||
3. `command=` must run **`gitea serv`** as user **`gitea`** (e.g. `sudo -n -E -u gitea /usr/bin/gitea …`) with `SSH_ORIGINAL_COMMAND` preserved in sudoers.
|
||||
|
||||
Portfolio uses repo path **`ilia/sdetprofile`** (lowercase on disk).
|
||||
95
docs/guides/unifi-static-dhcp.md
Normal file
95
docs/guides/unifi-static-dhcp.md
Normal file
@ -0,0 +1,95 @@
|
||||
# UniFi static DHCP (10.0.10.x homelab)
|
||||
|
||||
**Controller:** https://192.168.2.1/
|
||||
**Goal:** Pin Proxmox VM MAC addresses to stable `10.0.10.x` addresses so Caddy and Ansible inventory do not drift.
|
||||
|
||||
LXCs on pve10 (**210, 215–219**) are already static via `pct set` — **no UniFi lease needed** for those rows. This guide is for **VMs** (and pve201 guests) that still use DHCP.
|
||||
|
||||
---
|
||||
|
||||
## Before you start
|
||||
|
||||
1. Confirm guests get addresses on **`10.0.10.0/24`** (not only `192.168.2.x`). In UniFi, open the network that faces Proxmox `vmbr0`.
|
||||
2. Gateway for homelab guests should be **`10.0.10.1`** (or your router’s IP on that VLAN).
|
||||
3. Use the MAC table in [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md).
|
||||
|
||||
---
|
||||
|
||||
## Method A — From a connected client (easiest)
|
||||
|
||||
1. Open **https://192.168.2.1/** and sign in.
|
||||
2. Go to **Clients** (or **UniFi Devices** → **Clients**).
|
||||
3. Find the device (hostname like `gitea`, `vaultwarden`, or MAC from Proxmox `qm config <vmid>`).
|
||||
4. Click the client → **Settings** (gear) or **⋮**.
|
||||
5. Enable **Fixed IP** / **Use fixed IP address**.
|
||||
6. Set IP to the target from the table (e.g. `10.0.10.169` for gitea).
|
||||
7. **Apply** / **Save**.
|
||||
8. On the VM: renew DHCP or reboot:
|
||||
```bash
|
||||
sudo dhclient -r && sudo dhclient
|
||||
# or: reboot
|
||||
```
|
||||
9. Verify: `ip -4 addr show` shows the reserved IP.
|
||||
|
||||
---
|
||||
|
||||
## Method B — DHCP static mapping (manual MAC)
|
||||
|
||||
1. **Settings** → **Networks**.
|
||||
2. Open the LAN/VLAN that serves **10.0.10.x** (name varies: `Default`, `Homelab`, `10.0.10`).
|
||||
3. **DHCP** section → **DHCP Static IP** / **Static leases** → **Create new**.
|
||||
4. Enter:
|
||||
- **MAC address** (from Proxmox, e.g. `BC:24:11:E9:BD:E5`)
|
||||
- **IP address** (e.g. `10.0.10.169`)
|
||||
- **Name** (optional, e.g. `giteaVM`)
|
||||
5. Save. Repeat for each row in the reservations table.
|
||||
6. Renew DHCP on each VM or reboot.
|
||||
|
||||
---
|
||||
|
||||
## Already static (skip UniFi DHCP)
|
||||
|
||||
| VMID | Name | IP | How |
|
||||
|------|------|-----|-----|
|
||||
| 210 | cal | 10.0.10.228 | `pct set` |
|
||||
| 215 | caseware | 10.0.10.105 | `pct set` |
|
||||
| 216 | auto | 10.0.10.59 | `pct set` |
|
||||
| 217 | identity | 10.0.10.21 | `pct set` |
|
||||
| 218 | monitoring | 10.0.10.22 | `pct set` |
|
||||
| 219 | portfolio | 10.0.10.106 | `pct set` (`iliadobkin.com`) |
|
||||
| 106 | caddy | 10.0.10.50 | static in `/etc/network/interfaces` |
|
||||
|
||||
---
|
||||
|
||||
## Priority order — UniFi reservations (VMs / pve201)
|
||||
|
||||
| Order | Guest | IP | MAC | Notes |
|
||||
|-------|-------|-----|-----|-------|
|
||||
| 1 | giteaVM | 10.0.10.169 | BC:24:11:E9:BD:E5 | |
|
||||
| 2 | vaultwardenVM | 10.0.10.142 | BC:24:11:58:DB:DC | |
|
||||
| 3 | n8n (WRA) | 10.0.10.154 | BC:24:11:61:DE:7A | |
|
||||
| 4 | hermes | 10.0.10.36 | BC:24:11:51:1E:99 | |
|
||||
| 5 | actual | 10.0.10.158 | BC:24:11:10:7B:64 | |
|
||||
| 6 | jellyfin | 10.0.10.232 | BC:24:11:29:B8:84 | stopped until NAS OK |
|
||||
| 7 | listmonk (pve201 VM 113) | 10.0.10.148 | BC:24:11:11:53:9A | |
|
||||
| 8 | Mailcow (pve201) | 10.0.10.132 | BC:24:11:34:75:2D | |
|
||||
| 9 | TrueNAS | 10.0.10.107 | BC:24:11:14:DE:B5 | optional pin |
|
||||
| 10 | PVE.BU.SVR | 10.0.10.200 | BC:24:11:DA:95:3B | lab VM |
|
||||
|
||||
Full MAC table: [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md).
|
||||
|
||||
---
|
||||
|
||||
## If you only see 192.168.2.x in UniFi
|
||||
|
||||
Your Mac may be on `192.168.2.0/24` while Proxmox guests use a separate **`10.0.10.0/24`** network. In that case:
|
||||
|
||||
- Add or edit a UniFi network/VLAN for `10.0.10.0/24`, or
|
||||
- Ensure the router bridges/routes between `192.168.2.x` and `10.0.10.x`, and
|
||||
- Put DHCP reservations on the network that actually serves the Proxmox bridge.
|
||||
|
||||
---
|
||||
|
||||
## After reservations
|
||||
|
||||
Mark `✅ router` in [host-list.md](host-list.md) for each guest.
|
||||
36
docs/guides/vm-static-ip-router-reservations.md
Normal file
36
docs/guides/vm-static-ip-router-reservations.md
Normal file
@ -0,0 +1,36 @@
|
||||
# VM static IPs — router DHCP reservations (pve10)
|
||||
|
||||
Proxmox **LXCs** use `pct set … ip=10.0.10.X/24` (done for 210, 215–219).
|
||||
|
||||
**VMs** without cloud-init are pinned by **router DHCP reservation by MAC** (Method B in [unifi-static-dhcp.md](unifi-static-dhcp.md)). Ansible **cannot log into your router** — configure static leases in the UI.
|
||||
|
||||
**Your UniFi:** https://192.168.2.1/ — step-by-step: [unifi-static-dhcp.md](unifi-static-dhcp.md).
|
||||
|
||||
Homelab guests use **`10.0.10.0/24`** (gateway `10.0.10.1`). If UniFi also serves `192.168.2.x`, ensure the `10.0.10.x` segment is the network those VMs/LXCs plug into (or that routing/DHCP relay matches your Proxmox bridge).
|
||||
|
||||
## How to add a reservation (any router)
|
||||
|
||||
1. Open router admin (UniFi: **https://192.168.2.1/**).
|
||||
2. Find **DHCP** / **LAN** / **Static leases** / **Reserved addresses**.
|
||||
3. For each row: **MAC address** → **IP address** → Save.
|
||||
4. Reboot guest or renew DHCP (`dhclient -r && dhclient` on Debian) if IP does not update immediately.
|
||||
5. Mark done in [host-list.md](host-list.md).
|
||||
|
||||
| VMID | Name | MAC | Reserve IP | Inventory |
|
||||
|------|------|-----|------------|-----------|
|
||||
| 102 | gitea-alpine | `BC:24:11:E9:BD:E5` | `10.0.10.169` | giteaVM |
|
||||
| 103 | WRA / n8n | `BC:24:11:61:DE:7A` | `10.0.10.154` | n8n |
|
||||
| 104 | vaultwarden | `BC:24:11:58:DB:DC` | `10.0.10.142` | vaultwardenVM |
|
||||
| 105 | TrueNAS | `BC:24:11:14:DE:B5` | `10.0.10.107` | — |
|
||||
| 106 | caddy | `BC:24:11:E0:49:B4` | `10.0.10.50` | ✅ static in-guest |
|
||||
| 108 | actual | `BC:24:11:10:7B:64` | `10.0.10.158` | actual |
|
||||
| 117 | hermes | `BC:24:11:51:1E:99` | `10.0.10.36` | hermes (guest agent on) |
|
||||
| 200 | PVE.BU.SVR | `BC:24:11:DA:95:3B` | `10.0.10.200` | — |
|
||||
| 201 | NextcloudAIO | `BC:24:11:14:D4:DE` | `10.0.10.24` | **decommission** — skip new work |
|
||||
| 101 | Jellyfin | `BC:24:11:29:B8:84` | `10.0.10.232` | stopped |
|
||||
| 113 | listmonk (pve201) | `BC:24:11:11:53:9A` | `10.0.10.148` | listmonk |
|
||||
| — | Mailcow (pve201 VM 106) | `BC:24:11:34:75:2D` | `10.0.10.132` | mailcow (inventory) |
|
||||
|
||||
After reserving in the router, mark **DHCP/Static** as `✅ router` in [host-list.md](host-list.md).
|
||||
|
||||
In-guest static (optional, stronger): SSH as root and set `/etc/network/interfaces` like caddy VM 106.
|
||||
15
inventories/production/group_vars/all/mailcow.yml
Normal file
15
inventories/production/group_vars/all/mailcow.yml
Normal file
@ -0,0 +1,15 @@
|
||||
---
# Mailcow mailbox definitions (passwords live in vault only).
# Create: make mailcow-mailbox MAILBOX=<key>
# Add a new key under mailcow_mailboxes + vault_mailcow_mailbox_passwords.<key>

# Base URL of the Mailcow instance and the mail domain the mailboxes belong to.
mailcow_url: "https://mail.levkine.ca"
mailcow_domain: "levkine.ca"

# One entry per mailbox; the top-level key is the <key> used with MAILBOX=<key>.
mailcow_mailboxes:
  alerts:
    local_part: alerts
    name: Monitoring Alerts
    # NOTE(review): unit is not stated here (presumably MiB) — confirm against the Mailcow API.
    quota: 1024
    # Presumably the key looked up in vault_mailcow_mailbox_passwords — verify against the consuming playbook.
    vault_password_key: alerts

# Falls back to an empty string when the vault variable is not defined.
mailcow_api_key: "{{ vault_mailcow_api_key | default('') }}"
|
||||
@ -26,6 +26,10 @@ maintenance_pre_reboot_delay: 5 # Delay before reboot in seconds
|
||||
|
||||
# Default Tailscale settings - these tell the playbook to use your vault key
|
||||
tailscale_auth_key: "{{ vault_tailscale_auth_key | default('') }}"
|
||||
|
||||
# Mailcow — API key + per-mailbox passwords in vault; definitions in group_vars/all/mailcow.yml
|
||||
mailcow_api_key: "{{ vault_mailcow_api_key | default('') }}"
|
||||
mailcow_mailbox_passwords: "{{ vault_mailcow_mailbox_passwords | default({}) }}"
|
||||
tailscale_accept_routes: true
|
||||
tailscale_accept_dns: true
|
||||
tailscale_ssh: false
|
||||
|
||||
@ -22,6 +22,33 @@ vault_ssh_public_key: "ssh-ed25519 AAAA... you@example"
|
||||
# LXC create bootstrap password (often required by Proxmox)
|
||||
vault_lxc_root_password: "CHANGE_ME"
|
||||
|
||||
# Mailcow API — System → Configuration → Access → API (read/write)
|
||||
vault_mailcow_api_key: "CHANGE_ME"
|
||||
# Per-mailbox passwords (make mailcow-mailbox MAILBOX=<key>)
|
||||
vault_mailcow_mailbox_passwords:
|
||||
alerts: "CHANGE_ME"
|
||||
# Legacy alias (optional)
|
||||
vault_alerts_mailbox_password: "CHANGE_ME"
|
||||
|
||||
# Uptime Kuma + SMTP (monitoring LXC)
|
||||
vault_uptime_kuma_url: "http://10.0.10.22:3001"
|
||||
vault_uptime_kuma_user: "admin"
|
||||
vault_uptime_kuma_password: "CHANGE_ME"
|
||||
vault_kuma_smtp_host: "mail.levkine.ca"
|
||||
vault_kuma_smtp_port: "587"
|
||||
vault_kuma_smtp_user: "alerts@levkine.ca"
|
||||
vault_kuma_smtp_password: "CHANGE_ME"
|
||||
vault_kuma_smtp_to: "idobkin@gmail.com"
|
||||
|
||||
# Umami (monitoring LXC /opt/monitoring/.env)
|
||||
vault_umami_db_password: "CHANGE_ME"
|
||||
vault_umami_app_secret: "CHANGE_ME"
|
||||
|
||||
# Hermes Mattermost (not Telegram)
|
||||
vault_mattermost_url: "https://slack.levkin.ca"
|
||||
vault_mattermost_token: "CHANGE_ME"
|
||||
vault_mattermost_allowed_users: "CHANGE_ME"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# POTE (python/venv + cron) secrets
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@ -1,100 +1,125 @@
|
||||
$ANSIBLE_VAULT;1.1;AES256
|
||||
38316537376634623462313731323238666165383731656632373665653534623163386333303865
|
||||
3865383030316132663831303932376437346335323233630a643331663539383163306666393764
|
||||
38313265656561343839616565343663353037663237663032366632373831363336306632626266
|
||||
3361643865333533340a356233663034343932323831323236356161396237346532323838373135
|
||||
33393239313730363336613338373039663735323431323562613363343863326234633833663631
|
||||
66343462623231663932633537373361313764393630356666393662653135356139663935613038
|
||||
65383261363065633235343031346535373564373931373063386265343335623265653739613830
|
||||
32656233393330633362623932316431383761306332393466313936396533333839313831663331
|
||||
34353864356336303331663233653666363966376162303731626134313235306238323363303439
|
||||
32333039653235326632303637303065386161616138356463623561366637376366326262303166
|
||||
38323763393934666539373063323265333961666164613437316164633565393035626538353365
|
||||
33386562336665383863636639643232623161643933313664396534383362303838663362653736
|
||||
64393334616165336638306235363734653431646431616139373336656333623963386538646230
|
||||
39663230363063386231343730663162313463666135323265613261626637626332353534396535
|
||||
31623664363766646332396336396133613662643232366433323330373962633839613635333763
|
||||
63306230623438346639323863353137363330316630316130326134323731326635643736373736
|
||||
62336362656265633233623165376436373231656666303832373966353732313031623865316663
|
||||
63356163636238346230623732326232646434623532633439646536656362393162613535613565
|
||||
66616539316362376561386263373464623030636661663435383839643565393632616232663035
|
||||
34653735383964653930633664346330386566343830336238306562343164366131643138643339
|
||||
35313366356637643262636238366263353535306434633732623335643266396335666636666663
|
||||
37333232393765306433326164663538663839623034373535653737633366303665633831303334
|
||||
32303061363863386139613464326466336136396534663538643163343439343763383534306636
|
||||
62353733613330376163386331626463656462336237656339356132643135363537343638303261
|
||||
33366332653439313137613665386136666536356537346665333935366336623734393738346434
|
||||
63326265346362636564366265373134336662626332653464646139656635313961656230336537
|
||||
63666638326337643033363964643339666130386139363138656165666333356465643337396165
|
||||
30336330633632353231613938646165383966613863366330646162646266346139343434393865
|
||||
66346365663230626531643963383462636465363965393762336233366538393133313138616335
|
||||
32353834313762363265643031343237633732393166343139363163326439666162396332353038
|
||||
31306530626666343361313736313636613335376163383237303063393333386663333333336137
|
||||
37346166316231623638386635613230663063653037643930333961316434643361633035633734
|
||||
65643937636361653433383262643265373165613437336236633631323635613034663834646665
|
||||
30373730373438613132633932333565376665333565383932356334653738646166393934626362
|
||||
30666666303832613633316230623038343165396338343535663931383639623430643238656261
|
||||
39623037333063306266323335303736346236636137633863353866343136346335353865303961
|
||||
31346331333066376330306361396262333762393838303165383134303435353630366130303536
|
||||
34386532356239326166386665623435646432636561363564656161646563306234333138333839
|
||||
38316337656631313763393135396464643338386636336234346663653538353863643636323032
|
||||
35326133623064363838386662653138613438386564316635373838366262656364666633636539
|
||||
61306563666138656161336466323537626161313366616662623362643036636132663634313137
|
||||
39653437306662646162613763343736636530356465346132646238633166373838353836326461
|
||||
36326666323636353239303262623436643932353164323630326635653635653233363265316264
|
||||
30653763643431626539356161376534396437636463303363663134373961616561363561333333
|
||||
34306537326666383664336464656464623731656566653132613565336536323438666333366466
|
||||
64613738653730333633383062653837366266316536653139643362373039383831363666333934
|
||||
34383833336266356436666636323239336432386133303466636138643934356266326533643161
|
||||
36393664313963393930383533623565383332613933396639613037323266663439313138326261
|
||||
30353861303661303836343165353362663632306430626337356562343637653164396237333566
|
||||
37656230363530323836373363646334356262646633313932383161303264613238373936353036
|
||||
61376264633930356465626266623930333039383032316163633037323035346130343934616261
|
||||
31666166393462366561303833353135326566356637376466613934376233303162323033623031
|
||||
63656131333439353537623662363530383866326432306361316465383137633536666364623662
|
||||
37353561633839623530333663643130326131333330626661396636343234666139336539653162
|
||||
62383636663137626637303535333862366434626161353239393232313537343865646564626331
|
||||
39366665363030643764663963316163343033326434373265343664393439316333346434363563
|
||||
61346164396561343865626362616433306230333130653166656230353364316536626432373333
|
||||
35383133363530666263316431396462383133363965336637386632363263656261353963313161
|
||||
36383632326264373436383638383064346334336238656239393833653531656461356136303434
|
||||
37663434663732306631656334306361663562303863386135623066633963373034373139666332
|
||||
35393433646333363839666434663535363661616330386234366132303161383063663836626561
|
||||
35393064343735303032313266643338623834383838633834636536363539656466663864613366
|
||||
66636363623330326436363936313938333638323939323035616232366563316364343834376630
|
||||
66656434336661643861613737616138396330383832386230383331646462323363373363393733
|
||||
63363237636137373566363438663966396432613964336164326138623737393636396234646232
|
||||
64343361363365356135666235623833396131626663303839653535663732313831633163643638
|
||||
35396262373837343238343838663635353838373338663732626330613237623332336436643136
|
||||
38653833383430393837383566643765653834306636356466326364303334653034626262356630
|
||||
34333338333336373433356235386337346666343830303164363235303265313134323339653339
|
||||
63316238346132653663653165313635336638646362356337643766366564383531633565303431
|
||||
66616433663630343439336661346266336139613537653438653432326666326137306364376137
|
||||
66333939643262633532363966623439373434393862353237613135646663623236646331643537
|
||||
31353566653464313433636635393330646166613232633734346639326534373163383064353732
|
||||
32373861303064346266643338316465653031646633633936373738663837383162643534623131
|
||||
31633662356534343636313834386139656439663733333762323962323939623032396239356437
|
||||
37633739613433613365313337383835623936623530363831383535663337343264356532616434
|
||||
39393634396664636166346631313764343733666534613935393637363233373331303837656463
|
||||
37363266363634353136316532333462396266373733333633356239653334363835326261323661
|
||||
66323032346364356230613831643236316530356132343863393361343462373433383265336333
|
||||
30343730316366366234333263343965633466333439653739663333643939303631353664316435
|
||||
36396139623562656632666165666662626263643436396431326135633932393965656531633761
|
||||
39303634643936366438336534613532303134343164326661626363656562383564623264636132
|
||||
39656636303636393761653035303832386430646162343830343834316534636263373763643765
|
||||
61366335643531666232303231656336643833396238336639333437363564636566636632303364
|
||||
62623738336237393638363436396662656565653839643164356565313563663561666237383036
|
||||
33626464663465643230376164653062663063636630613064643632643235643662653566333333
|
||||
62353763643830363638323731303537633837393235656661333263323536363330356362643333
|
||||
34346666656432626365383639326538643862346265316263326531623631383962383734316330
|
||||
39333430613761663337306331623461643635653431343336663163343766373464366538313335
|
||||
61643538643231333636643836663663313534356662386532633331346664653262353839643066
|
||||
36393366653131316636646336313362656662666163333635633132323438353435373430643839
|
||||
37623936393962333065663536306238653466363634386632366637363265303734356535333735
|
||||
64623330303965393533326563643063303762646664666464643239386435343065326234306632
|
||||
35346338373866303838613933653230373737396134653533376265356432333933356237636338
|
||||
66656536393530316435323863373962636465333331653364626162326562393565313538633264
|
||||
34613633393862333731336563636136666166613037613833333063303162373339663539646631
|
||||
36303962356562306239616634376339356135666663303836353061663039343836356262373932
|
||||
65346466373532633365383835323062313531623130396130376531626333653862393462643631
|
||||
366330333666336262373364663864336633
|
||||
38333966306139633330626334636166336434613661376233313731353237353562376237323166
|
||||
3736663161376133336431353334306533316337633662310a646533656261633333306433626564
|
||||
33626434353661343431316632643938663639356531336564653230353439316236343861643665
|
||||
6231393530333937340a333033366564393536613330373232373861666439316536336164306633
|
||||
31373433653531363636663262616535643137363039356534313462653232663663343464303938
|
||||
33633838373935333433653732656261633463653835393864353862346563303063656431343065
|
||||
61323331363032383365613734373165343530303230373237346162306361613461353939623934
|
||||
64303138383537386435653461356130356563653036343339333761303030393933393735616531
|
||||
33386462303037613263373036386332656563346539633131366636333163376162613231313337
|
||||
64336137373038636233346539616136343933343635353639633633616438333739303864376162
|
||||
36656639313966633234323738326435373935363166626664613561636637396166353961623262
|
||||
31333064306537376631656235636265313235643339353735373666316364616432336536303830
|
||||
39393136393864383035633462366637396438323838643337633361373132363365616333613431
|
||||
30326533366265303165653761333034656261363862353061383761363530666135373265623332
|
||||
33373538616433383835663139383065366433333939356366353635633834666362646465366130
|
||||
31636235613934313465646136623834343062353539653163373032326130303034653365653431
|
||||
63306635323431376562396236653966633833396262343664643562366235393961316564656565
|
||||
61356436313363376233376137303062656462363933643465616436353964373837383536306136
|
||||
35626163393638353261633030653164643063626463383133666137323333633463616138643931
|
||||
38346633653430303031643830363166363561346336646666343330303164336164333561386535
|
||||
66616661306133626164343166303362383262636331313465343434643262353862313438616462
|
||||
33383734626463616330666265636265623064326635633066656533306530376663653366613534
|
||||
36666337346238333137303931633631366236373236383932343763653637343434336462343662
|
||||
64313239313435353365383338376133386639326136636164386439306665663965353565333030
|
||||
65363139636134656333616335643435643038373832383134636666303536663236303231313030
|
||||
61616664373264663763343334303437643264396435373230333561323036363764383730373461
|
||||
61636661316330373732363835303039346438313133393862306138613634333334356633346232
|
||||
63666132303939656465356665323435326435333135303735346332613134633736333338653066
|
||||
31616532616537343735326232613235323364386636396531383333316633666338306635656565
|
||||
63316338343032346261343863623163353934653434363336643836353431643937393261393339
|
||||
61363562373533396631623830613431663262643631663637396663626466663634323037666662
|
||||
65663132393863333135663831386132646533353535326430323864396132343762623464643461
|
||||
35306330666635343362316239386463633161623664653063356561356166613332363432393730
|
||||
33646439663039653037383630356166323733373963353239643231326338633838623033633339
|
||||
66666630306130336632333736396335666437383164633466373534333334356261383538353363
|
||||
30623461333365633536663236363661323835356361363331653437613131303732643134343038
|
||||
32663338356462343535396534646263656331366265356532616234663966626138633031323866
|
||||
33346662336534323037353835333032633965326163623365643230666339363566353938623931
|
||||
33316539396538333433373236656339396165313930613331396135666236326231336563343063
|
||||
34646233336137323166663635323266613635343363636334353865343931616665613462613764
|
||||
30323865623164303333333166393963613535616563316531383231313239666337343961333938
|
||||
34663931343535333830333036646463356132613064663037323237366563656239343665653263
|
||||
32343535653037633931653565663166623736306166623363316632316236663534383938656564
|
||||
32633734373336383630663436373863343136663337306364326432663763326561363961623464
|
||||
31326263623935343933333739373038373838616432646533316230613762336236306338616163
|
||||
34333266316537646439343937366261303833363665373734386632613733313435336438343534
|
||||
61393363396261666265396361313063636334623765613564393736616461313438613234333661
|
||||
64383764653464373131326332656435343163613561623762663532643130666338633736393931
|
||||
34316535376235616533353831343537363533346331316332323439383837303631626261316564
|
||||
30383566363737643065356565346161376637646431633732636333373862653966323461356535
|
||||
64613964666135373038656364376334336631376261373338643737633266393761623837643730
|
||||
38316233626130383231623930346338306164653336643066656665356463313131343738316230
|
||||
66316133306134656330643532303538373661333161343133613266333465663534326231306461
|
||||
65653634373934323432303833353339356531313164346238623639373363393137336334306131
|
||||
39393463613032633533363236323730386133356135383030656261363761333765383831646238
|
||||
35386164353462646236306337393364323665316364626265363736316638353266626665393662
|
||||
38663137626361366334373033643864613664656631616532373935313031343633373631323533
|
||||
34656561396463336662313834653634306435616439336161323763313732313331663436633663
|
||||
35323961393133343566623937313064646532643638336163633538613465363138653161386238
|
||||
62333139336537656339333737363933346333633534396230356561303063626266666661366130
|
||||
38626338353336616161373334306165333930646563613436303233666563636462643435396233
|
||||
62323634393063616461653134353133323664346566663664383766313939653036303930633331
|
||||
66353762623338303530633463336533373634333734653430303139366637373130306561653264
|
||||
34333533666437343732363036356132313230323838373233636631336434313563336366316466
|
||||
63393633363461393164323063396238346262623136623639383963616662323137633139323766
|
||||
31303765323730303863376166386631643031306130396338376538373362323335643964303137
|
||||
62626131656262613437383036636438383262396533646163363365326134633834666236333335
|
||||
65633037626335376230303937366463376664363062366361663362373434656637636230623561
|
||||
39626634343761303030346365633333333039326364303762326461316361343231363932323336
|
||||
39623033303232316263323433366638393435336563636138343261636561356363366138653033
|
||||
62373731623461363135383037613065396264333966353436613466663931343033326363323138
|
||||
62306133613163633134626138663434356562633936346239373837336439653061613762626533
|
||||
38623366313464393631666330353738393538366537313637613732613532663339653637616633
|
||||
65623637373230333738343136393332376364316438633164306539336233373065396339373562
|
||||
31383163316231356538626333323533663863383339643363303334323833353164356662326530
|
||||
34653630663330663330323864333965303236313266393636333839643863666236646665633137
|
||||
33353038626562663266386161393331326636353862643233326231623063623463313231373862
|
||||
31333639626232306339373435386562663035303633383333653066643361643139356134633264
|
||||
33363832353735633462363761343138323234356530656136636236623365353531356337393234
|
||||
37363133333763643863373338616532666464336238363631636131313261326164313430363434
|
||||
62363730623464343532653431353266336262363262373933646234653563663535363133343634
|
||||
64663535363231353738303663626166383831383531363130373466633532356635313530383432
|
||||
63636462656236303033376637643462616230626163373832666337636263333866313466616563
|
||||
62613162363633353235363039366365396662383335386165373233633539616530363264653266
|
||||
62643138333631353138336366646632386563353431343737363265353065373834326432623265
|
||||
30663630323361353635613363633032386465623139376630653038376536616462326134343363
|
||||
37643638323731313065653931663739306134323861313538313965636632653064393033376231
|
||||
36663666633836646636376166356361633961626466383030656162363362396566333832393439
|
||||
62306265386638333138363764646331643136636566343736613862343233303461633661643832
|
||||
35653839303039383233373532643632353964343365396131393933636537656334316466313531
|
||||
36633364643230336161316639313130316131663663393966333162373632386635393130313263
|
||||
64656439663135373265383732316435346135376563356630316662333664353564333038313730
|
||||
66346131396132366632306633656334376334653038646535383135636665396362343238346663
|
||||
36643132666434633730653431346265353662613265326230653333396239626633346633343231
|
||||
38303739303665343933633439623131333632383432343962653130396666373164633431653663
|
||||
35353264653833306163646164376234666364363766336564346332393831336537663936346433
|
||||
37346438353835353736316530323336336334376133663834363161326563353966356534333830
|
||||
64656164356661343462646536366234323062323164636434333863346337303661366164646562
|
||||
64383666343339346332643832616266346439353863616138613965373764333261356331316466
|
||||
62643939643461363238386463346638373630333437633737636630666161323461616539306634
|
||||
64646666626461306563393830396661313636633332396132363961373038386566646230323739
|
||||
62373064323761316135613538663132316365633339356664316365383234303635663435363239
|
||||
34336236663435643563376130396535623137333466363536393031303139356565313766656432
|
||||
64313365383631383034313831393462666437663733633165643230663539613630643264376631
|
||||
66653861313639666235613034633935633836656638643764343639373931366332373837343765
|
||||
61313765326362303963666165373364663664313631373136623437343837396165313930636165
|
||||
39323030303839333036393432383731303030643430643766383662366335386230623163303733
|
||||
37303232346534333433626330343637313534363562653133383966356538396638663762326530
|
||||
35336166393763626466323863663137386531356436306530323738373365643635613231636564
|
||||
62333839336137353833353036323533333163663331663033633938633533626637653538613038
|
||||
38613539303534366437633135616631303261643135616436653664326132356636653931306564
|
||||
62616434353733303863376361356465613531306534376333613261323764303137306266636434
|
||||
64363238633736643361393730626666656664333233616361643834373239623230303533343935
|
||||
31343362333735386338643433613333613736323639646562323437313733303331396136383762
|
||||
31663137386431386630343666663139363736313731323930313539313939623832313864386637
|
||||
66316531343238303936323234653033303666333233323334623837653665353565666335323638
|
||||
37363466373363333362656563383066366434306262323363336533356531363861356162326162
|
||||
66316135653963323765343934306630633132353036346536613663386339393632393764303530
|
||||
62333330306136346265306237393435353430313635393339363038313137623663316331656539
|
||||
31396361623230326433393239626536636437623737363131653363646237656165346463643338
|
||||
35306536376634336264643564346163373233666330393630633339346533653963346630396139
|
||||
36363430303866616334666631653732306230626238653463626132666638643938623030373538
|
||||
32353062626562396134393230386562346163643531376630616161646633333131383437386330
|
||||
34393665646530306663
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
---
|
||||
$ANSIBLE_VAULT;1.1;AES256
|
||||
66633265383239626163633134656233613638643862323562373330643363323036333334646566
|
||||
3439646635343533353432323064643135623532333738380a353866643461636233376432396434
|
||||
|
||||
4
inventories/production/host_vars/hermes.yml
Normal file
4
inventories/production/host_vars/hermes.yml
Normal file
@ -0,0 +1,4 @@
|
||||
---
|
||||
# Hermes agent VM 117 @ 10.0.10.36 (user: hermes, admin: ladmin)
|
||||
# Secrets: vault_hermes_telegram_bot_token, mattermost in /home/hermes/.hermes/secrets/
|
||||
hermes_home: /home/hermes/.hermes
|
||||
4
inventories/production/host_vars/localhost.yml
Normal file
4
inventories/production/host_vars/localhost.yml
Normal file
@ -0,0 +1,4 @@
|
||||
---
|
||||
# Control node (runs playbooks with connection: local).
|
||||
# Use project venv so API deps (proxmoxer, etc.) match `make bootstrap`.
|
||||
ansible_python_interpreter: "{{ inventory_dir }}/../../.venv/bin/python3"
|
||||
7
inventories/production/host_vars/mailcow.yml
Normal file
7
inventories/production/host_vars/mailcow.yml
Normal file
@ -0,0 +1,7 @@
|
||||
---
|
||||
# Mailcow VM 106 on pve201 (Mailcow-debian)
|
||||
# API/UI: https://mail.levkine.ca — domain levkine.ca (with e)
|
||||
# SSH: root only (no ladmin). First access: make copy-ssh-key-mailcow
|
||||
mailcow_url: "https://mail.levkine.ca"
|
||||
mailcow_domain: "levkine.ca"
|
||||
mailcow_alerts_user: "alerts"
|
||||
@ -13,8 +13,9 @@ pve201 ansible_host=10.0.10.201 ansible_user=root
|
||||
pve10 ansible_host=10.0.10.10 ansible_user=root
|
||||
|
||||
[sites]
|
||||
caseware ansible_host=10.0.10.105 ansible_user=root url=https://caseware.levkin.ca
|
||||
auto ansible_host=10.0.10.59 ansible_user=root url=https://auto.levkin.ca
|
||||
caseware ansible_host=10.0.10.105 ansible_user=root url=https://caseware.levkin.ca proxmox_vmid=215 proxmox_node=PVENAS
|
||||
auto ansible_host=10.0.10.59 ansible_user=root url=https://auto.levkin.ca proxmox_vmid=216 proxmox_node=PVENAS
|
||||
portfolio ansible_host=10.0.10.106 ansible_user=root url=https://iliadobkin.com proxmox_vmid=219 proxmox_node=PVENAS
|
||||
|
||||
[dev]
|
||||
dev01 ansible_host=10.0.30.105 ansible_user=ladmin
|
||||
@ -31,21 +32,30 @@ KrakenMint ansible_host=10.0.10.120 ansible_user=ladmin
|
||||
[ansible]
|
||||
ansibleVM ansible_host=10.0.10.157 ansible_user=master
|
||||
|
||||
[comms]
|
||||
# pve201 — email + newsletters
|
||||
mailcow ansible_host=10.0.10.132 ansible_user=root url=https://mail.levkine.ca proxmox_vmid=106 proxmox_node=pve201
|
||||
listmonk ansible_host=10.0.10.148 ansible_user=root url=https://listmonk.levkin.ca proxmox_node=pve201
|
||||
|
||||
[services]
|
||||
hermes ansible_host=10.0.10.36 ansible_user=ladmin url=https://hermes.levkin.ca proxmox_vmid=117
|
||||
caddy ansible_host=10.0.10.50 ansible_user=ladmin
|
||||
jellyfin ansible_host=10.0.10.232 ansible_user=root url=https://jelly.levkin.ca
|
||||
listmonk ansible_host=10.0.10.148 ansible_user=root url=https://listmonk.levkin.ca
|
||||
nextcloud ansible_host=10.0.10.25 ansible_user=root url=https://nextcloud.levkin.ca
|
||||
actual ansible_host=10.0.10.158 ansible_user=root url=https://budget.levkin.ca
|
||||
# VMID 117: on PVENAS (pve10)
|
||||
hermes ansible_host=10.0.10.36 ansible_user=ladmin url=https://hermes.levkin.ca proxmox_vmid=117 proxmox_node=PVENAS
|
||||
caddy ansible_host=10.0.10.50 ansible_user=ladmin proxmox_vmid=106 proxmox_node=PVENAS
|
||||
cal ansible_host=10.0.10.228 ansible_user=root url=https://cal.levkin.ca proxmox_vmid=210 proxmox_node=PVENAS
|
||||
identity ansible_host=10.0.10.21 ansible_user=root url=https://auth.levkin.ca proxmox_vmid=217 proxmox_node=PVENAS
|
||||
monitoring ansible_host=10.0.10.22 ansible_user=root url=http://10.0.10.22:3001 proxmox_vmid=218 proxmox_node=PVENAS uptime_kuma_port=3001 dockge_port=5001 umami_port=3000
|
||||
giteaVM ansible_host=10.0.10.169 ansible_user=root url=https://git.levkin.ca proxmox_vmid=102 proxmox_node=PVENAS
|
||||
n8n ansible_host=10.0.10.154 ansible_user=root url=https://n8n.levkin.ca proxmox_vmid=103 proxmox_node=PVENAS
|
||||
vaultwardenVM ansible_host=10.0.10.142 ansible_user=ladmin url=https://vault.levkin.ca proxmox_vmid=104 proxmox_node=PVENAS
|
||||
actual ansible_host=10.0.10.158 ansible_user=root url=https://budget.levkin.ca proxmox_vmid=108 proxmox_node=PVENAS
|
||||
vikanjans ansible_host=10.0.10.159 ansible_user=root url=https://todo.levkin.ca
|
||||
n8n ansible_host=10.0.10.154 ansible_user=root url=https://n8n.levkin.ca
|
||||
giteaVM ansible_host=10.0.10.169 ansible_user=root url=https://git.levkin.ca
|
||||
portainerVM ansible_host=10.0.30.69 ansible_user=ladmin
|
||||
homepageVM ansible_host=10.0.30.12 ansible_user=homepage url=https://home.levkin.ca
|
||||
vaultwardenVM ansible_host=10.0.10.142 ansible_user=ladmin url=https://vault.levkin.ca
|
||||
qBittorrent ansible_host=10.0.10.91 ansible_user=root port=8080
|
||||
cal ansible_host=10.0.10.228 ansible_user=root url=https://cal.levkin.ca
|
||||
jellyfin ansible_host=10.0.10.232 ansible_user=root url=https://jelly.levkin.ca proxmox_vmid=101 proxmox_node=PVENAS # stopped until NAS pool healthy
|
||||
|
||||
# Retired / stopped — kept for reference; do not run playbooks against these without intent
|
||||
# nextcloud ansible_host=10.0.10.24 ansible_user=root url=https://nextcloud.levkin.ca # VM 201 decommission
|
||||
# portainerVM ansible_host=10.0.30.69 ansible_user=ladmin # retired → Dockge on monitoring
|
||||
# homepageVM ansible_host=10.0.30.12 ansible_user=homepage # VM 100 stopped on pve10
|
||||
|
||||
#[desktop]
|
||||
#desktop-beast ansible_host=100.117.34.106 ansible_user=beast
|
||||
|
||||
53
playbooks/caddy-auth-authentik.yml
Normal file
53
playbooks/caddy-auth-authentik.yml
Normal file
@ -0,0 +1,53 @@
|
||||
---
|
||||
# Playbook: caddy-auth-authentik
|
||||
# Purpose: Add auth.levkin.ca reverse proxy to Caddy (Phase 1 Authentik)
|
||||
# Targets: caddy
|
||||
# Usage: make -f Makefile caddy-auth OR ansible-playbook playbooks/caddy-auth-authentik.yml
|
||||
|
||||
- name: Add Authentik proxy block to Caddy
  hosts: caddy
  become: true
  become_method: su

  tasks:
    # Adds the auth.levkin.ca site block directly after the closing brace of
    # the cal.levkin.ca block; if that anchor block is not present the site
    # block is appended at the end of the Caddyfile instead, so the task
    # always leaves the block in place.
    #
    # The script prints CADDYFILE_UPDATED only when it rewrote the file, so a
    # second run reports "ok" and does not trigger a needless Caddy reload.
    - name: Ensure auth.levkin.ca HTTPS block exists (after cal block)
      ansible.builtin.shell: |
        set -euo pipefail
        if grep -q '^auth\.levkin\.ca {' /etc/caddy/Caddyfile; then
          exit 0
        fi
        # Private temp file instead of a fixed, predictable /tmp path.
        tmp="$(mktemp /etc/caddy/Caddyfile.XXXXXX)"
        trap 'rm -f "${tmp}"' EXIT
        awk '
          function emit_auth_block() {
            print ""
            print "auth.levkin.ca {"
            print " import security-headers"
            print " encode gzip"
            print " reverse_proxy 10.0.10.21:9000"
            print "}"
          }
          /^cal\.levkin\.ca \{/ { in_cal = 1 }
          in_cal && /^}$/ && !done {
            print
            emit_auth_block()
            done = 1
            next
          }
          { print }
          END { if (!done) emit_auth_block() }
        ' /etc/caddy/Caddyfile > "${tmp}"
        # Write through the existing file so its owner and mode are kept.
        cat "${tmp}" > /etc/caddy/Caddyfile
        echo CADDYFILE_UPDATED
      args:
        executable: /bin/bash
      register: caddy_auth_block
      changed_when: "'CADDYFILE_UPDATED' in caddy_auth_block.stdout"
      notify: Reload caddy

    # blockinfile is idempotent on its own: the marker lines delimit the
    # managed region and the task only reports "changed" when it differs.
    - name: Ensure auth.levkin.ca HTTP redirect in :80 block
      ansible.builtin.blockinfile:
        path: /etc/caddy/Caddyfile
        marker: "# {mark} ANSIBLE MANAGED auth.levkin.ca :80"
        insertafter: '@vault host vault.levkin.ca'
        # The explicit indentation indicator keeps the leading spaces of the
        # lines below in the inserted text.
        block: |2
              @auth host auth.levkin.ca
              redir @auth https://auth.levkin.ca{uri} permanent
      notify: Reload caddy

  handlers:
    # Runs once at the end of the play, and only if a task above changed the file.
    - name: Reload caddy
      ansible.builtin.command: caddy reload --config /etc/caddy/Caddyfile
      changed_when: true
|
||||
20
playbooks/ssh-keys.yml
Normal file
20
playbooks/ssh-keys.yml
Normal file
@ -0,0 +1,20 @@
|
||||
---
|
||||
# Playbook: ssh-keys
|
||||
# Purpose: Install your workstation SSH public key on all inventory hosts
|
||||
# Targets: all hosts except localhost
|
||||
# Usage: make copy-ssh-keys-ansible
|
||||
# make copy-ssh-keys-ansible GROUP=services
|
||||
# make copy-ssh-keys-ansible HOST=dev01
|
||||
|
||||
- name: Deploy workstation SSH public key
  # Every inventory host except whatever the inventory names `local`.
  hosts: all:!local
  # No facts are collected; the task below only uses inventory variables and lookups.
  gather_facts: false
  vars:
    # Public key path on the control node: $SSH_PUBLIC_KEY when it is set and
    # non-empty, otherwise $HOME/.ssh/id_ed25519.pub.
    ssh_public_key_file: >-
      {{ lookup('env', 'SSH_PUBLIC_KEY') | default(lookup('env', 'HOME') + '/.ssh/id_ed25519.pub', true) }}
  tasks:
    - name: Add SSH public key for ansible_user
      # Connect and write as the login user itself; no privilege escalation.
      become: false
      ansible.posix.authorized_key:
        key: "{{ lookup('file', ssh_public_key_file) }}"
        state: present
        user: "{{ ansible_user | default(ansible_user_id) }}"
|
||||
60
scripts/bootstrap-root-ssh-su-password.sh
Executable file
60
scripts/bootstrap-root-ssh-su-password.sh
Executable file
@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env bash
# Bootstrap root SSH when `su` needs a password (no sudo on host).
# Usage: BOOTSTRAP_SU_PASSWORD='...' ./scripts/bootstrap-root-ssh-su-password.sh HOST
#
# HOST is an inventory hostname; its ansible_host= value is read from
# inventories/production/hosts.
#
# Environment:
#   BOOTSTRAP_SU_PASSWORD  password typed at the `su -` prompt (required)
#   BOOTSTRAP_USER         unprivileged SSH login user (default: ladmin)
#   SSH_PUBLIC_KEY         public key file to authorize for root
#                          (default: ~/.ssh/id_ed25519.pub)
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
HOST="${1:-}"
BOOTSTRAP_USER="${BOOTSTRAP_USER:-ladmin}"
PUBKEY_FILE="${SSH_PUBLIC_KEY:-${HOME}/.ssh/id_ed25519.pub}"
SU_PASSWORD="${BOOTSTRAP_SU_PASSWORD:-}"

[[ -n "${HOST}" ]] || { echo "Usage: $0 HOST" >&2; exit 1; }
[[ -n "${SU_PASSWORD}" ]] || { echo "Set BOOTSTRAP_SU_PASSWORD" >&2; exit 1; }
[[ -f "${PUBKEY_FILE}" ]] || { echo "Missing ${PUBKEY_FILE}" >&2; exit 1; }

# Look up the host's address: first inventory line whose first field is HOST.
IP="$(awk -v h="${HOST}" '$1==h {for(i=2;i<=NF;i++) if($i~/^ansible_host=/) {sub(/ansible_host=/,"",$i); print $i; exit}}' \
  "${REPO_ROOT}/inventories/production/hosts")"
[[ -n "${IP}" ]] || { echo "No ansible_host for ${HOST}" >&2; exit 1; }

PUBKEY="$(cat "${PUBKEY_FILE}")"
# The expect script reads these through $env(...); the heredoc is quoted so
# bash does not expand anything inside it.
export IP BOOTSTRAP_USER SU_PASSWORD PUBKEY

/usr/bin/expect <<'EXPECT'
set timeout 60
spawn ssh -o StrictHostKeyChecking=accept-new $env(BOOTSTRAP_USER)@$env(IP)
expect {
  -re {[$#] $} { }
  timeout { exit 1 }
}
send "su -\r"
expect {
  "Password:" {
    send "$env(SU_PASSWORD)\r"
  }
  timeout { exit 1 }
}
expect {
  -re {root@caddy|#|❯|[$#] $} { }
  timeout { exit 1 }
}
# Plain bash without rc files gives a predictable "# " prompt to match on.
send "bash --noprofile --norc\r"
expect {
  -re {# $} { }
  timeout { exit 1 }
}
send "mkdir -p /root/.ssh && chmod 700 /root/.ssh && touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys\r"
expect -re {# $}
send "grep -qF '$env(PUBKEY)' /root/.ssh/authorized_keys || echo '$env(PUBKEY)' >> /root/.ssh/authorized_keys\r"
expect -re {# $}
# sed exits 0 even when no line matched, so "sed || echo" never appended the
# directive; check for the directive explicitly after the substitution.
send "sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config 2>/dev/null; grep -q '^PermitRootLogin' /etc/ssh/sshd_config 2>/dev/null || echo 'PermitRootLogin prohibit-password' >> /etc/ssh/sshd_config\r"
expect -re {# $}
send "systemctl restart ssh 2>/dev/null || systemctl restart sshd 2>/dev/null || true\r"
expect -re {# $}
# Three nested shells are open (login shell -> su -> bash); leave all of them
# so the ssh session really ends instead of `expect eof` waiting for the timeout.
send "exit\r"
send "exit\r"
send "exit\r"
expect eof
EXPECT

# ssh -i expects the private key. Derive it from the public key path and fall
# back to the public key file (which selects the matching agent identity)
# only when no private key file is present locally.
IDENTITY_FILE="${PUBKEY_FILE%.pub}"
[[ -f "${IDENTITY_FILE}" ]] || IDENTITY_FILE="${PUBKEY_FILE}"

ssh -o BatchMode=yes -i "${IDENTITY_FILE}" -o ConnectTimeout=10 \
  "root@${IP}" "echo OK: root@${IP}"
echo "Done: root key on ${HOST}"
|
||||
103
scripts/bootstrap-root-ssh.sh
Executable file
103
scripts/bootstrap-root-ssh.sh
Executable file
@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env bash
# Bootstrap root SSH key access via a normal user (default: ladmin).
# Usage: ./scripts/bootstrap-root-ssh.sh HOSTNAME
#   BOOTSTRAP_USER=ladmin TARGET_USER=root SSH_PUBLIC_KEY=~/.ssh/id_ed25519.pub
#
# Steps:
#   1. ssh-copy-id the public key for BOOTSTRAP_USER (prompts for its password).
#   2. Upload the key plus a small setup script, then run the script through
#      `su - root` on a real terminal so su can prompt for the root password.
#   3. Verify that TARGET_USER accepts the key without a password.
#
# INVENTORY_HOSTS overrides the inventory file used to resolve ansible_host.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
INVENTORY_HOSTS="${INVENTORY_HOSTS:-${REPO_ROOT}/inventories/production/hosts}"
PUBKEY_FILE="${SSH_PUBLIC_KEY:-${HOME}/.ssh/id_ed25519.pub}"
BOOTSTRAP_USER="${BOOTSTRAP_USER:-ladmin}"
TARGET_USER="${TARGET_USER:-root}"
HOST="${1:-}"

if [[ -z "${HOST}" ]]; then
  echo "Usage: $0 HOST" >&2
  exit 1
fi

if [[ ! -f "${PUBKEY_FILE}" ]]; then
  echo "Public key not found: ${PUBKEY_FILE}" >&2
  exit 1
fi

# `ssh -i` wants the private key. Use the file next to the .pub when it exists;
# otherwise keep passing the public key, which only works via ssh-agent.
IDENTITY_FILE="${PUBKEY_FILE%.pub}"
[[ -f "${IDENTITY_FILE}" ]] || IDENTITY_FILE="${PUBKEY_FILE}"
SSH_OPTS=(-o StrictHostKeyChecking=accept-new -i "${IDENTITY_FILE}")

# Print two lines for HOST from the inventory: ansible_host, then ansible_user.
# Either line is empty when the corresponding variable is not set.
resolve_from_inventory() {
  awk -v host="${HOST}" '
    $1 == host {
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^ansible_host=/) {
          sub(/ansible_host=/, "", $i)
          ip = $i
        }
        if ($i ~ /^ansible_user=/) {
          sub(/ansible_user=/, "", $i)
          user = $i
        }
      }
    }
    END {
      print ip
      print user
    }
  ' "${INVENTORY_HOSTS}"
}

# Parse the inventory once and split the two output lines.
INVENTORY_FIELDS="$(resolve_from_inventory)"
IP="$(sed -n '1p' <<<"${INVENTORY_FIELDS}")"
INV_USER="$(sed -n '2p' <<<"${INVENTORY_FIELDS}")"

if [[ -z "${IP}" ]]; then
  echo "Could not resolve ansible_host for ${HOST} in ${INVENTORY_HOSTS}" >&2
  exit 1
fi

echo "==> ${HOST} (${BOOTSTRAP_USER}@${IP} -> ${TARGET_USER})"
echo "    Inventory ansible_user: ${INV_USER:-<unset>}"
echo "    Public key: ${PUBKEY_FILE}"
echo ""

echo "Step 1/3: Install key for ${BOOTSTRAP_USER} (password: ${BOOTSTRAP_USER})"
ssh-copy-id -i "${PUBKEY_FILE}" -o StrictHostKeyChecking=accept-new \
  "${BOOTSTRAP_USER}@${IP}"

echo ""
echo "Step 2/3: Copy key and configure ${TARGET_USER} via su (password: root)"

# The root-side work lives in a file rather than on ssh's stdin: feeding a
# heredoc to `ssh -t` prevents tty allocation, and su then has no terminal
# to read the root password from.
ROOT_SCRIPT="$(mktemp)"
trap 'rm -f "${ROOT_SCRIPT}"' EXIT
cat >"${ROOT_SCRIPT}" <<'ROOT_SCRIPT_EOF'
set -e
key_file="$1"
mkdir -p /root/.ssh
chmod 700 /root/.ssh
touch /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys
if ! grep -qF -- "$(cat "${key_file}")" /root/.ssh/authorized_keys 2>/dev/null; then
  cat "${key_file}" >> /root/.ssh/authorized_keys
fi
if [ -f /etc/ssh/sshd_config ]; then
  if grep -q '^PermitRootLogin' /etc/ssh/sshd_config; then
    sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
  else
    echo 'PermitRootLogin prohibit-password' >> /etc/ssh/sshd_config
  fi
  systemctl restart ssh 2>/dev/null \
    || systemctl restart sshd 2>/dev/null \
    || service ssh restart 2>/dev/null \
    || true
fi
echo "OK: root authorized_keys updated; PermitRootLogin prohibit-password"
ROOT_SCRIPT_EOF

# Private scratch directory on the host instead of a fixed /tmp file name.
REMOTE_DIR="$(ssh "${SSH_OPTS[@]}" "${BOOTSTRAP_USER}@${IP}" 'mktemp -d /tmp/ansible-bootstrap.XXXXXX')"
if [[ "${REMOTE_DIR}" != /tmp/ansible-bootstrap.* ]]; then
  echo "Unexpected remote temp dir: ${REMOTE_DIR:-<empty>}" >&2
  exit 1
fi

scp "${SSH_OPTS[@]}" "${PUBKEY_FILE}" "${BOOTSTRAP_USER}@${IP}:${REMOTE_DIR}/key.pub"
scp "${SSH_OPTS[@]}" "${ROOT_SCRIPT}" "${BOOTSTRAP_USER}@${IP}:${REMOTE_DIR}/root-setup.sh"

# stdin stays attached to the caller's terminal, so -t really allocates a tty.
# The scratch directory is removed whether or not su succeeded.
ssh -t "${SSH_OPTS[@]}" "${BOOTSTRAP_USER}@${IP}" \
  "su - root -c 'sh ${REMOTE_DIR}/root-setup.sh ${REMOTE_DIR}/key.pub'; rc=\$?; rm -rf '${REMOTE_DIR}'; exit \$rc"

echo ""
echo "Step 3/3: Verify ${TARGET_USER} key login"
ssh -o BatchMode=yes -i "${IDENTITY_FILE}" -o StrictHostKeyChecking=accept-new \
  "${TARGET_USER}@${IP}" "echo OK: ${TARGET_USER}@${IP} accepts your SSH key"

echo ""
echo "Done: ${HOST} — use: ssh -i ${IDENTITY_FILE} ${TARGET_USER}@${IP}"
|
||||
66
scripts/kuma-setup-smtp.sh
Executable file
66
scripts/kuma-setup-smtp.sh
Executable file
@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env bash
# Configure Uptime Kuma SMTP notification (Mailcow) via Socket.IO API.
# Run from machine with network access to Kuma:
#   export KUMA_URL=http://10.0.10.22:3001
#   export KUMA_USER=admin
#   export KUMA_PASSWORD='your-kuma-password'
#   export SMTP_USER=alerts@levkine.ca
#   export SMTP_PASS='mailbox-password'
#   export SMTP_TO=idobkin@gmail.com
#   pip install uptime-kuma-api
#   ./scripts/kuma-setup-smtp.sh
#
# Optional: SMTP_HOST, SMTP_PORT (default 587) and SMTP_SECURE. SMTP_SECURE
# (1/true/yes/on or 0/false/no/off) forces implicit TLS on or off; when unset,
# implicit TLS is used only on port 465 and other ports rely on STARTTLS.

set -euo pipefail

KUMA_URL="${KUMA_URL:-http://10.0.10.22:3001}"
KUMA_USER="${KUMA_USER:-admin}"
KUMA_PASSWORD="${KUMA_PASSWORD:-}"
SMTP_HOST="${SMTP_HOST:-mail.levkine.ca}"
SMTP_PORT="${SMTP_PORT:-587}"
SMTP_USER="${SMTP_USER:-alerts@levkine.ca}"
SMTP_PASS="${SMTP_PASS:-}"
SMTP_TO="${SMTP_TO:-idobkin@gmail.com}"
SMTP_SECURE="${SMTP_SECURE:-}"

if [[ -z "${KUMA_PASSWORD}" || -z "${SMTP_PASS}" ]]; then
  echo "Set KUMA_PASSWORD and SMTP_PASS" >&2
  exit 1
fi

# The Python child reads os.environ, so values that only received their
# default above must be exported or the lookups below raise KeyError.
export KUMA_URL KUMA_USER KUMA_PASSWORD
export SMTP_HOST SMTP_PORT SMTP_USER SMTP_PASS SMTP_TO SMTP_SECURE

python3 <<'PY'
import os
import sys

try:
    from uptime_kuma_api import UptimeKumaApi
except ImportError:
    print("pip install uptime-kuma-api", file=sys.stderr)
    sys.exit(1)

url = os.environ["KUMA_URL"]
user = os.environ["KUMA_USER"]
password = os.environ["KUMA_PASSWORD"]
smtp_host = os.environ["SMTP_HOST"]
smtp_port = int(os.environ["SMTP_PORT"])
smtp_user = os.environ["SMTP_USER"]
smtp_pass = os.environ["SMTP_PASS"]
smtp_to = os.environ["SMTP_TO"]

# "secure" means TLS from the first byte (the port 465 convention). On 587 the
# server expects a plaintext greeting followed by STARTTLS, so forcing it on
# there makes the connection fail.
secure_env = os.environ.get("SMTP_SECURE", "").strip().lower()
if secure_env:
    smtp_secure = secure_env in ("1", "true", "yes", "on")
else:
    smtp_secure = smtp_port == 465

with UptimeKumaApi(url) as api:
    api.login(user, password)
    # Notification type name in Kuma 1.x is often 'smtp' / 'email'
    result = api.add_notification(
        name="Mailcow alerts",
        type="smtp",
        isDefault=True,
        applyExisting=True,
        smtpHost=smtp_host,
        smtpPort=smtp_port,
        smtpSecure=smtp_secure,
        smtpIgnoreTLS=False,
        smtpUsername=smtp_user,
        smtpPassword=smtp_pass,
        smtpFrom=smtp_user,
        smtpTo=smtp_to,
    )
    print(result)
PY
|
||||
51
scripts/load-mailcow-vault-env.sh
Executable file
51
scripts/load-mailcow-vault-env.sh
Executable file
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
# Export Mailcow API + mailbox password from .env or Ansible vault.
# Usage: source scripts/load-mailcow-vault-env.sh [mailbox_local_part]
#
# The mailbox key comes from $1, else $MAILBOX, else $MAILBOX_LOCAL_PART.
# Exports (when found): MAILCOW_API_KEY, MAILBOX_PASSWORD, ALERTS_PASSWORD.
#
# NOTE(review): the `set -euo pipefail` below also applies to the shell that
# sources this file — confirm that is acceptable for interactive use.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VAULT_FILE="${REPO_ROOT}/inventories/production/group_vars/all/vault.yml"
VAULT_PASS="${HOME}/.ansible-vault-pass"
ANSIBLE_VAULT="${REPO_ROOT}/.venv/bin/ansible-vault"
MAILBOX_KEY="${1:-${MAILBOX:-${MAILBOX_LOCAL_PART:-}}}"

# `set -a` marks every variable assigned while .env is sourced for export.
set -a
[ -f "${REPO_ROOT}/.env" ] && . "${REPO_ROOT}/.env"
set +a

# Already have an API key and some mailbox password: no vault access needed.
if [[ -n "${MAILCOW_API_KEY:-}" && -n "${MAILBOX_PASSWORD:-${ALERTS_PASSWORD:-}}" ]]; then
  export MAILBOX_PASSWORD="${MAILBOX_PASSWORD:-${ALERTS_PASSWORD:-}}"
  # `return` succeeds when sourced; when executed directly it fails, so exit instead.
  return 0 2>/dev/null || exit 0
fi

# Without the vault file or its password file there is nothing more to load.
if [[ ! -f "${VAULT_FILE}" ]] || [[ ! -f "${VAULT_PASS}" ]]; then
  return 0 2>/dev/null || exit 0
fi

# The Python helper decrypts the vault with `ansible-vault view` and prints
# shell-quoted `export` statements, which are evaluated in this shell.
eval "$("${REPO_ROOT}/.venv/bin/python3" - "${VAULT_FILE}" "${VAULT_PASS}" "${ANSIBLE_VAULT}" "${MAILBOX_KEY}" <<'PY'
import os, subprocess, sys, yaml, shlex

vault_file, vault_pass, ansible_vault, mailbox_key = sys.argv[1:5]
text = subprocess.check_output(
    [ansible_vault, "view", vault_file, "--vault-password-file", vault_pass],
    text=True,
)
data = yaml.safe_load(text) or {}
out = []
api = data.get("vault_mailcow_api_key") or ""
if api:
    out.append("export MAILCOW_API_KEY=" + shlex.quote(str(api)))
passwords = data.get("vault_mailcow_mailbox_passwords") or {}
pw = ""
# Prefer the per-mailbox entry; "alerts" may also use its dedicated vault key.
if mailbox_key and mailbox_key in passwords:
    pw = passwords[mailbox_key]
elif mailbox_key == "alerts":
    pw = data.get("vault_alerts_mailbox_password") or passwords.get("alerts", "")
if pw:
    out.append("export MAILBOX_PASSWORD=" + shlex.quote(str(pw)))
    out.append("export ALERTS_PASSWORD=" + shlex.quote(str(pw)))
print("\n".join(out))
PY
)"

return 0 2>/dev/null || exit 0
|
||||
18
scripts/load-vault-lxc-root-password.sh
Executable file
18
scripts/load-vault-lxc-root-password.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
# Export BOOTSTRAP_SU_PASSWORD from vault_lxc_root_password
#
# Decrypts inventories/production/group_vars/all/vault.yml with the repo's
# .venv ansible-vault and ~/.ansible-vault-pass, then evaluates the printed
# `export` statement. Nothing is exported when the vault key is missing/empty.
# NOTE(review): the variable only reaches the caller if this file is sourced,
# in which case `set -euo pipefail` also applies to the caller — confirm usage.
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
eval "$("${REPO_ROOT}/.venv/bin/python3" - "${REPO_ROOT}" <<'PY'
import os, subprocess, sys, yaml, shlex
repo = sys.argv[1]
text = subprocess.check_output(
    [os.path.join(repo, ".venv/bin/ansible-vault"), "view",
     os.path.join(repo, "inventories/production/group_vars/all/vault.yml"),
     "--vault-password-file", os.path.expanduser("~/.ansible-vault-pass")],
    text=True,
)
pw = (yaml.safe_load(text) or {}).get("vault_lxc_root_password", "")
if pw:
    # shlex.quote keeps the value intact when the shell evaluates the line.
    print("export BOOTSTRAP_SU_PASSWORD=" + shlex.quote(str(pw)))
PY
)"
|
||||
32
scripts/mailcow-mailbox-from-inventory.sh
Executable file
32
scripts/mailcow-mailbox-from-inventory.sh
Executable file
@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env bash
# Resolve MAILBOX= key from inventories/production/group_vars/all/mailcow.yml
#
# Looks up mailcow_mailboxes[MAILBOX] and emits shell `export` statements for
# MAILBOX_LOCAL_PART, MAILBOX_NAME, MAILBOX_QUOTA and MAILBOX_VAULT_KEY
# (only for the fields that are present).
#
# Executed:  eval "$(MAILBOX=alerts ./scripts/mailcow-mailbox-from-inventory.sh)"
#            the statements are printed on stdout for the caller to eval.
# Sourced:   the variables are exported into the current shell.
# Exits non-zero when MAILBOX is missing or not defined in mailcow.yml.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
MAILBOX="${MAILBOX:-}"
[[ -n "${MAILBOX}" ]] || { echo "MAILBOX required" >&2; exit 1; }

# Capture first: a plain assignment propagates the helper's exit status under
# `set -e`, whereas eval "$(...)" would silently evaluate an empty string.
MAILBOX_EXPORTS="$("${REPO_ROOT}/.venv/bin/python3" - "${REPO_ROOT}" "${MAILBOX}" <<'PY'
import sys, yaml, shlex, os

repo, key = sys.argv[1], sys.argv[2]
path = os.path.join(repo, "inventories/production/group_vars/all/mailcow.yml")
with open(path) as f:
    data = yaml.safe_load(f) or {}
boxes = data.get("mailcow_mailboxes") or {}
if key not in boxes:
    raise SystemExit(f"Unknown MAILBOX={key!r}. Add it to mailcow_mailboxes in mailcow.yml")
b = boxes[key]
out = []
for k, env in [
    ("local_part", "MAILBOX_LOCAL_PART"),
    ("name", "MAILBOX_NAME"),
    ("quota", "MAILBOX_QUOTA"),
]:
    if k in b and b[k] is not None:
        out.append(f"export {env}={shlex.quote(str(b[k]))}")
if b.get("vault_password_key"):
    out.append(f"export MAILBOX_VAULT_KEY={shlex.quote(str(b['vault_password_key']))}")
print("\n".join(out))
PY
)"

eval "${MAILBOX_EXPORTS}"

# A child process cannot change its parent's environment, so when run as a
# command the statements must be handed back on stdout.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  printf '%s\n' "${MAILBOX_EXPORTS}"
fi
|
||||
62
scripts/mailcow-mailbox.sh
Executable file
62
scripts/mailcow-mailbox.sh
Executable file
@ -0,0 +1,62 @@
|
||||
#!/usr/bin/env bash
# Create or update a Mailcow mailbox via API.
#
# Usage:
#   make mailcow-mailbox MAILBOX=alerts
#   # or with env (after: source scripts/load-mailcow-vault-env.sh):
#   MAILBOX_LOCAL_PART=notify MAILBOX_NAME="Notify" MAILBOX_PASSWORD='...' ./scripts/mailcow-mailbox.sh
#
# Variables (env or make):
#   MAILBOX / MAILBOX_LOCAL_PART — local part (required)
#   MAILBOX_NAME — display name (default: title-case of local part)
#   MAILBOX_PASSWORD — if unset, loaded from vault_mailcow_mailbox_passwords[local_part]
#   MAILBOX_QUOTA — MiB (default 1024)
#   MAILCOW_URL, MAILCOW_DOMAIN, MAILCOW_API_KEY — see load-mailcow-vault-env.sh
#
# Exits 1 when a required value is missing or the API answers with a non-2xx
# HTTP status. The request is a POST to ${MAILCOW_URL}/api/v1/add/mailbox.

set -euo pipefail

MAILCOW_URL="${MAILCOW_URL:-https://mail.levkine.ca}"
DOMAIN="${MAILCOW_DOMAIN:-levkine.ca}"
LOCAL_PART="${MAILBOX_LOCAL_PART:-${MAILBOX:-}}"
API_KEY="${MAILCOW_API_KEY:-}"
MAILBOX_PASSWORD="${MAILBOX_PASSWORD:-${ALERTS_PASSWORD:-}}"
QUOTA="${MAILBOX_QUOTA:-1024}"

if [[ -z "${LOCAL_PART}" ]]; then
  echo "Set MAILBOX=localpart or MAILBOX_LOCAL_PART" >&2
  exit 1
fi

if [[ -z "${API_KEY}" ]]; then
  echo "Set MAILCOW_API_KEY (make mailcow-mailbox loads vault/.env)" >&2
  exit 1
fi

if [[ -z "${MAILBOX_PASSWORD}" ]]; then
  echo "Set MAILBOX_PASSWORD or add vault_mailcow_mailbox_passwords.${LOCAL_PART} in vault" >&2
  exit 1
fi

# Default display name: "-"/"_" become spaces and every word is capitalised.
DISPLAY_NAME="${MAILBOX_NAME:-$(echo "${LOCAL_PART}" | sed 's/[-_]/ /g' | awk '{for(i=1;i<=NF;i++) $i=toupper(substr($i,1,1)) tolower(substr($i,2)); print}')}"

# jq builds the JSON so quotes/backslashes in any value are escaped correctly.
ATTR=$(jq -nc \
  --arg lp "${LOCAL_PART}" \
  --arg dom "${DOMAIN}" \
  --arg name "${DISPLAY_NAME}" \
  --arg pw "${MAILBOX_PASSWORD}" \
  --arg quota "${QUOTA}" \
  '{local_part:$lp,domain:$dom,name:$name,quota:$quota,password:$pw,password2:$pw,active:"1"}')

echo "Creating mailbox ${LOCAL_PART}@${DOMAIN} (${DISPLAY_NAME})..."
# --data-urlencode keeps "&", "+", "%" and "=" inside the JSON (typically in
# the password) from being interpreted as form-encoding syntax.
# NOTE(review): -k disables TLS certificate verification while the API key is
# sent; drop it once the server certificate is trusted by this machine.
RESP=$(curl -sk -w "\n%{http_code}" -X POST "${MAILCOW_URL}/api/v1/add/mailbox" \
  -H "X-API-Key: ${API_KEY}" \
  --data-urlencode "attr=${ATTR}")
# The status code is the last line (added by -w); everything before is the body.
HTTP_CODE=$(echo "${RESP}" | tail -1)
BODY=$(echo "${RESP}" | sed '$d')
echo "${BODY}" | jq . 2>/dev/null || echo "${BODY}"
if [[ "${HTTP_CODE}" -lt 200 || "${HTTP_CODE}" -ge 300 ]]; then
  echo "Mailcow API HTTP ${HTTP_CODE}" >&2
  exit 1
fi

echo "Done: ${LOCAL_PART}@${DOMAIN}"
|
||||
17
scripts/run-mailcow-mailbox.sh
Executable file
17
scripts/run-mailcow-mailbox.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env bash
# Wrapper for: make mailcow-mailbox MAILBOX=name
#
# 1. Evaluates the statements printed by mailcow-mailbox-from-inventory.sh.
# 2. Sources load-mailcow-vault-env.sh for the API key and mailbox password.
# 3. Replaces itself with mailcow-mailbox.sh, which performs the API call.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
MAILBOX="${MAILBOX:?MAILBOX required}"
# The helper scripts read MAILBOX from their environment.
export MAILBOX

cd "${REPO_ROOT}"
# Capture before eval: a plain assignment aborts under `set -e` when the
# lookup fails, whereas eval "$(...)" would evaluate an empty string and
# carry on.
INVENTORY_EXPORTS="$(./scripts/mailcow-mailbox-from-inventory.sh)"
eval "${INVENTORY_EXPORTS}"
. ./scripts/load-mailcow-vault-env.sh "${MAILBOX_VAULT_KEY:-${MAILBOX}}"

if [[ -z "${MAILCOW_API_KEY:-}" || -z "${MAILBOX_PASSWORD:-}" ]]; then
  echo "Missing vault_mailcow_api_key or vault_mailcow_mailbox_passwords.${MAILBOX}" >&2
  exit 1
fi

exec ./scripts/mailcow-mailbox.sh
|
||||
39
scripts/security-audit-lxc-via-pve.sh
Executable file
39
scripts/security-audit-lxc-via-pve.sh
Executable file
@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
# Audit LXCs on a Proxmox node via pct exec (run ON the PVE host as root).
#
# For every container reported by `pct list`, prints a header with its vmid,
# name and status; running containers additionally get the AUDIT script below
# executed inside them with `pct exec <vmid> -- bash -c`.
set -u

# Script run inside each container. It is a single-quoted string, so it must
# not contain single quotes itself.
AUDIT='#!/bin/bash
echo "=== identity ==="
hostname -f 2>/dev/null || hostname
[ -f /etc/os-release ] && . /etc/os-release && echo "os=${PRETTY_NAME:-unknown}"
echo "ip=$(hostname -I 2>/dev/null | awk "{print \$1}")"
echo "=== sshd (effective) ==="
if command -v sshd >/dev/null 2>&1; then
  sshd -T 2>/dev/null | grep -E "^(permitrootlogin|passwordauthentication|pubkeyauthentication|permitemptypasswords|port) " || true
else
  grep -E "^(PermitRootLogin|PasswordAuthentication|PubkeyAuthentication|Port) " /etc/ssh/sshd_config 2>/dev/null | grep -v "^#" || echo "sshd not installed"
fi
echo "=== firewall ==="
if command -v ufw >/dev/null 2>&1; then
  ufw status 2>/dev/null | head -3
else
  echo "no ufw"
fi
echo "=== fail2ban ==="
systemctl is-active fail2ban 2>/dev/null || echo "inactive/missing"
echo "=== pending upgrades ==="
if command -v apt-get >/dev/null 2>&1; then
  apt-get -s upgrade 2>/dev/null | grep -c "^Inst" || true
else
  echo "n/a"
fi
echo "=== public listeners ==="
ss -tlnp 2>/dev/null | grep LISTEN | grep -v "127.0.0.1:" | grep -v "\[::1\]:" | head -12
'
# Inside AUDIT:
#  - the firewall fallback is an explicit command check because the exit
#    status of "ufw | head" is head's, so "|| echo" could never fire;
#  - grep -c already prints 0 (and exits 1) when nothing matches, so the
#    fallback is "|| true" rather than a second "0".

echo "PVE_NODE=$(hostname -f 2>/dev/null || hostname)"
echo "PVE_IP=$(hostname -I | awk '{print $1}')"

# Query the container list once and reuse it for every lookup.
PCT_LIST="$(pct list 2>/dev/null || true)"

for id in $(awk 'NR>1 {print $1}' <<<"${PCT_LIST}"); do
  # Columns are VMID, Status, Lock, Name. Lock is usually blank, which makes
  # Name the third whitespace-separated field, so take the last field.
  name=$(awk -v id="$id" '$1==id {print $NF}' <<<"${PCT_LIST}")
  status=$(awk -v id="$id" '$1==id {print $2}' <<<"${PCT_LIST}")
  echo ""
  echo "######## LXC vmid=$id name=$name status=$status ########"
  if [ "$status" != "running" ]; then
    echo "SKIP: not running"
    continue
  fi
  pct exec "$id" -- bash -c "$AUDIT" 2>&1 || echo "ERROR: pct exec failed"
done
|
||||
48
scripts/security-audit-remote.sh
Executable file
48
scripts/security-audit-remote.sh
Executable file
@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
# Quick read-only security snapshot (run on target host).
#
# Prints identity, effective sshd settings, firewall state, fail2ban and
# unattended-upgrades status, pending apt upgrades, non-loopback TCP
# listeners, uid-0 accounts and the last logins.
#
# Every probe is best-effort: under `set -euo pipefail` a probe that fails
# (for example `ufw status` without root) would otherwise abort the whole
# report, so report pipelines end in `|| true` or an explicit fallback.
set -euo pipefail

echo "=== identity ==="
hostname -f 2>/dev/null || hostname
if [ -f /etc/os-release ]; then . /etc/os-release; echo "os=${PRETTY_NAME:-unknown}"; fi
echo "kernel=$(uname -r)"
echo "uptime=$(uptime -p 2>/dev/null || uptime)"

echo "=== sshd (effective) ==="
if command -v sshd >/dev/null 2>&1; then
  sshd -T 2>/dev/null | grep -E '^(permitrootlogin|passwordauthentication|pubkeyauthentication|permitemptypasswords|port|x11forwarding|allowtcpforwarding) ' || true
else
  grep -E '^(PermitRootLogin|PasswordAuthentication|PubkeyAuthentication|Port) ' /etc/ssh/sshd_config 2>/dev/null | grep -v '^#' || echo "sshd not found"
fi

echo "=== firewall ==="
if command -v ufw >/dev/null 2>&1; then
  ufw status verbose 2>/dev/null | head -8 || true
elif command -v firewall-cmd >/dev/null 2>&1; then
  firewall-cmd --state 2>/dev/null || true
else
  echo "no ufw/firewalld"
fi

echo "=== fail2ban ==="
systemctl is-active fail2ban 2>/dev/null || echo "fail2ban: inactive or missing"

echo "=== unattended-upgrades ==="
systemctl is-active unattended-upgrades 2>/dev/null || echo "unattended-upgrades: inactive or missing"

echo "=== pending apt upgrades ==="
if command -v apt >/dev/null 2>&1; then
  # grep -c prints the count itself (including 0) and exits 1 on zero matches.
  apt-get -s upgrade 2>/dev/null | grep -c '^Inst' || true
else
  echo "n/a"
fi

echo "=== listening tcp (public) ==="
ss -tlnp 2>/dev/null | awk 'NR==1 || /LISTEN/ {print}' | grep -v '127.0.0.1:' | grep -v '\[::1\]:' | head -20 || true

echo "=== uid 0 accounts ==="
awk -F: '$3==0 {print $1}' /etc/passwd | tr '\n' ' '
echo

echo "=== last logins (top 5) ==="
last -n 5 2>/dev/null | head -5 || true
|
||||
27
scripts/security-audit-ssh.sh
Executable file
27
scripts/security-audit-ssh.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
# SSH-focused audit (hypervisor or guest).
#
# Prints the host name, selected effective sshd settings, the matching
# directives from /etc/ssh/sshd_config, a summary of root's authorized_keys
# and recent authentication failures.
set -u

echo "=== host ==="
hostname -f 2>/dev/null || hostname

echo "=== sshd effective config ==="
if command -v sshd >/dev/null 2>&1; then
  sshd -T 2>/dev/null | grep -E '^(port|permitrootlogin|passwordauthentication|pubkeyauthentication|permitemptypasswords|maxauthtries|x11forwarding|allowtcpforwarding|gatewayports|permittunnel|usepam|kbdinteractiveauthentication) ' || true
else
  echo "sshd binary missing"
fi

echo "=== sshd_config (non-comment) ==="
grep -E '^(Port|PermitRootLogin|PasswordAuthentication|PubkeyAuthentication|PermitEmptyPasswords|MaxAuthTries|AllowUsers|AllowGroups|X11Forwarding) ' /etc/ssh/sshd_config 2>/dev/null || true

echo "=== authorized_keys (root) ==="
if [ -f /root/.ssh/authorized_keys ]; then
  wc -l /root/.ssh/authorized_keys
  # Last field of each line (the key comment when one is present).
  awk '{print $NF}' /root/.ssh/authorized_keys 2>/dev/null | sed 's/^/  key: /'
else
  echo "no /root/.ssh/authorized_keys"
fi

echo "=== recent ssh auth failures (today) ==="
# A pipeline's status is that of its last command (tail), which succeeds even
# with no input, so "a | tail || b || c" never reaches b or c. Decide on the
# captured output instead.
failures="$(journalctl -u ssh -u sshd --since today 2>/dev/null | grep -iE 'Failed|Invalid|refused' | tail -5)"
if [ -z "${failures}" ]; then
  failures="$(grep -iE 'Failed|Invalid' /var/log/auth.log 2>/dev/null | tail -5)"
fi
if [ -n "${failures}" ]; then
  printf '%s\n' "${failures}"
else
  echo "no logs"
fi
|
||||
81
scripts/vault-export-env.sh
Executable file
81
scripts/vault-export-env.sh
Executable file
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env bash
# Write Ansible vault secrets into .env (for local scripts / reference).
# Does not print secret values. Does not overwrite non-empty .env keys.
#
# Usage: ./scripts/vault-export-env.sh [ENV_FILE]   (default: <repo>/.env)
# Requires ~/.ansible-vault-pass and the repo's .venv (python3, ansible-vault).
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
ENV_FILE="${1:-${REPO_ROOT}/.env}"
VAULT_FILE="${REPO_ROOT}/inventories/production/group_vars/all/vault.yml"
VAULT_PASS="${HOME}/.ansible-vault-pass"
ANSIBLE_VAULT="${REPO_ROOT}/.venv/bin/ansible-vault"

[[ -f "${VAULT_PASS}" ]] || { echo "Missing ${VAULT_PASS}" >&2; exit 1; }

# The heredoc is quoted ('PY'), so bash performs no expansion inside it; the
# paths reach Python as command-line arguments.
"${REPO_ROOT}/.venv/bin/python3" - "${ENV_FILE}" "${VAULT_FILE}" "${VAULT_PASS}" "${ANSIBLE_VAULT}" <<'PY'
import subprocess, sys, yaml
from pathlib import Path

env_file, vault_file, vault_pass, ansible_vault = sys.argv[1:5]

# vault key -> .env key
MAP = {
    "vault_mailcow_api_key": "MAILCOW_API_KEY",
    "vault_alerts_mailbox_password": "ALERTS_PASSWORD",
    "vault_uptime_kuma_password": "KUMA_PASSWORD",
    "vault_uptime_kuma_user": "KUMA_USER",
    "vault_uptime_kuma_url": "KUMA_URL",
    "vault_umami_admin_password": "UMAMI_ADMIN_PASSWORD",
    "vault_umami_db_password": "UMAMI_DB_PASS",
    "vault_umami_app_secret": "UMAMI_APP_SECRET",
    "vault_kuma_smtp_host": "SMTP_HOST",
    "vault_kuma_smtp_port": "SMTP_PORT",
    "vault_kuma_smtp_user": "SMTP_USER",
    "vault_kuma_smtp_password": "SMTP_PASS",
    "vault_kuma_smtp_to": "SMTP_TO",
    "vault_mattermost_url": "MATTERMOST_URL",
    "vault_mattermost_token": "MATTERMOST_TOKEN",
    "vault_mattermost_allowed_users": "MATTERMOST_ALLOWED_USERS",
}
|
||||
|
||||
def parse_env(text):
    """Parse dotenv-style *text* into a ``{key: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    A leading ``export `` is accepted. One pair of matching surrounding
    quotes (single or double) is removed from a value; a lone quote at
    either end is part of the value and is kept.
    """
    d = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        k, _, v = line.partition("=")
        v = v.strip()
        # Only strip a balanced pair; secrets may legitimately start or end
        # with a quote character.
        if len(v) >= 2 and v[0] == v[-1] and v[0] in "'\"":
            v = v[1:-1]
        d[k.strip()] = v
    return d
|
||||
|
||||
# Decrypt the vault to stdout only; the plaintext YAML stays in memory.
text = subprocess.check_output(
    [ansible_vault, "view", vault_file, "--vault-password-file", vault_pass],
    text=True,
)
data = yaml.safe_load(text) or {}
existing = parse_env(Path(env_file).read_text()) if Path(env_file).exists() else {}
merged = dict(existing)

# Fill in only keys that are missing or empty in the existing .env.
for vk, ek in MAP.items():
    val = data.get(vk)
    if val is None or val == "":
        continue
    if merged.get(ek):
        continue
    merged[ek] = str(val)

# Fall back to the per-mailbox password map for the alerts mailbox.
pw = data.get("vault_mailcow_mailbox_passwords") or {}
if pw.get("alerts") and not merged.get("ALERTS_PASSWORD"):
    merged["ALERTS_PASSWORD"] = str(pw["alerts"])

header = """# Merged from Ansible vault (make vault-export-env). Fill gaps manually.
# vault → .env: make vault-export-env
# .env → vault: make vault-import-env
# hosts → .env → vault: make vault-pull-infra-secrets

"""
body = "\n".join(f"{k}={v}" for k, v in sorted(merged.items())) + "\n"

# Open the file owner-only (0600) and tighten an already existing file before
# any secret is written, instead of relying on a chmod after the fact.
# NOTE(review): values are written unquoted; ones containing spaces or shell
# metacharacters will not survive `. .env` in scripts that source this file.
import os

fd = os.open(env_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    os.fchmod(f.fileno(), 0o600)
    f.write(header + body)
print(f"Wrote {len(merged)} keys to {env_file} (existing non-empty keys kept)")
PY

chmod 600 "${ENV_FILE}" 2>/dev/null || true
|
||||
96
scripts/vault-import-env.sh
Executable file
96
scripts/vault-import-env.sh
Executable file
@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env bash
# Merge .env into inventories/production/group_vars/all/vault.yml
# Usage: make vault-import-env [ENV_FILE=.env]
#
# The env file is taken from $1, else $ENV_FILE, else <repo>/.env.
# Requires ~/.ansible-vault-pass and the repo's .venv (python3, ansible-vault).
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
ENV_FILE="${1:-${ENV_FILE:-${REPO_ROOT}/.env}}"
VAULT_FILE="${REPO_ROOT}/inventories/production/group_vars/all/vault.yml"
VAULT_PASS="${HOME}/.ansible-vault-pass"
ANSIBLE_VAULT="${REPO_ROOT}/.venv/bin/ansible-vault"

[[ -f "${ENV_FILE}" ]] || { echo "No env file: ${ENV_FILE}" >&2; exit 1; }
[[ -f "${VAULT_PASS}" ]] || { echo "Missing ${VAULT_PASS}" >&2; exit 1; }

# The heredoc is quoted ('PY'), so bash performs no expansion inside it; the
# paths reach Python as command-line arguments.
"${REPO_ROOT}/.venv/bin/python3" - "${ENV_FILE}" "${VAULT_FILE}" "${VAULT_PASS}" "${ANSIBLE_VAULT}" <<'PY'
import os, re, subprocess, sys, tempfile, yaml

env_file, vault_file, vault_pass, ansible_vault = sys.argv[1:5]
|
||||
|
||||
def load_env(path):
    """Read the dotenv file at *path* and return its non-empty values.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped and
    a leading ``export `` is accepted. One pair of matching surrounding
    quotes (single or double) is removed from a value; a lone quote at
    either end is part of the value and is kept. Keys whose value is empty
    are left out of the result.
    """
    out = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("export "):
                line = line[7:].strip()
            if "=" not in line:
                continue
            k, _, v = line.partition("=")
            v = v.strip()
            # Only strip a balanced pair; secrets may legitimately start or
            # end with a quote character.
            if len(v) >= 2 and v[0] == v[-1] and v[0] in "'\"":
                v = v[1:-1]
            if v:
                out[k.strip()] = v
    return out
|
||||
|
||||
# .env key -> vault key (or vault_mailcow_mailbox_passwords.<name>)
|
||||
MAP = {
|
||||
"MAILCOW_API_KEY": "vault_mailcow_api_key",
|
||||
"ALERTS_PASSWORD": ("vault_alerts_mailbox_password", "alerts"),
|
||||
"KUMA_PASSWORD": "vault_uptime_kuma_password",
|
||||
"KUMA_USER": "vault_uptime_kuma_user",
|
||||
"KUMA_URL": "vault_uptime_kuma_url",
|
||||
"UMAMI_ADMIN_PASSWORD": "vault_umami_admin_password",
|
||||
"UMAMI_DB_PASS": "vault_umami_db_password",
|
||||
"UMAMI_APP_SECRET": "vault_umami_app_secret",
|
||||
"SMTP_HOST": "vault_kuma_smtp_host",
|
||||
"SMTP_PORT": "vault_kuma_smtp_port",
|
||||
"SMTP_USER": "vault_kuma_smtp_user",
|
||||
"SMTP_PASS": "vault_kuma_smtp_password",
|
||||
"SMTP_TO": "vault_kuma_smtp_to",
|
||||
"MATTERMOST_URL": "vault_mattermost_url",
|
||||
"MATTERMOST_TOKEN": "vault_mattermost_token",
|
||||
"MATTERMOST_ALLOWED_USERS": "vault_mattermost_allowed_users",
|
||||
"PROXMOX_PASSWORD": "vault_proxmox_password",
|
||||
"LXC_ROOT_PASSWORD": "vault_lxc_root_password",
|
||||
}
|
||||
|
||||
env = load_env(env_file)
|
||||
text = subprocess.check_output(
|
||||
[ansible_vault, "view", vault_file, "--vault-password-file", vault_pass],
|
||||
text=True,
|
||||
)
|
||||
data = yaml.safe_load(text) or {}
|
||||
passwords = dict(data.get("vault_mailcow_mailbox_passwords") or {})
|
||||
|
||||
for k, v in env.items():
|
||||
m = re.match(r"^MAILBOX_(.+)_PASSWORD$", k, re.I)
|
||||
if m:
|
||||
passwords[m.group(1).lower()] = v
|
||||
continue
|
||||
target = MAP.get(k)
|
||||
if not target:
|
||||
continue
|
||||
if isinstance(target, tuple):
|
||||
data[target[0]] = v
|
||||
passwords[target[1]] = v
|
||||
else:
|
||||
data[target] = v
|
||||
|
||||
if passwords:
|
||||
data["vault_mailcow_mailbox_passwords"] = passwords
|
||||
|
||||
fd, tmp = tempfile.mkstemp(suffix=".yml")
|
||||
os.close(fd)
|
||||
with open(tmp, "w") as f:
|
||||
yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
|
||||
|
||||
subprocess.run(
|
||||
[ansible_vault, "encrypt", tmp, "--output", vault_file,
|
||||
"--vault-password-file", vault_pass, "--encrypt-vault-id", "default"],
|
||||
check=True,
|
||||
)
|
||||
os.remove(tmp)
|
||||
print(f"Updated {vault_file} from {env_file} ({len(env)} values)")
|
||||
PY
|
||||
70
scripts/vault-pull-infra-secrets.sh
Executable file
70
scripts/vault-pull-infra-secrets.sh
Executable file
@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
# Pull secrets from live hosts into .env, then vault-import-env.
# Does not print secret values.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

# The heredoc is quoted ('PY'), so bash performs no expansion inside it; the
# .env path reaches Python as its first argument.
python3 - "${ENV_FILE}" <<'PY'
import subprocess, sys
from pathlib import Path

out = Path(sys.argv[1])
# "KEY=value" strings collected from the remote hosts.
lines = []
|
||||
|
||||
def sh(cmd):
    """Run *cmd* through the shell and return its stdout, whitespace-trimmed.

    Raises ``subprocess.CalledProcessError`` when the command exits non-zero.
    """
    completed = subprocess.run(
        cmd, shell=True, text=True, check=True, stdout=subprocess.PIPE
    )
    return completed.stdout.strip()
|
||||
|
||||
def parse_env(text):
    """Parse dotenv-style *text* into a ``{key: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    A leading ``export `` is accepted. One pair of matching surrounding
    quotes (single or double) is removed from a value; a lone quote at
    either end is part of the value and is kept.
    """
    d = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        k, _, v = line.partition("=")
        v = v.strip()
        # Only strip a balanced pair; secrets may legitimately start or end
        # with a quote character.
        if len(v) >= 2 and v[0] == v[-1] and v[0] in "'\"":
            v = v[1:-1]
        d[k.strip()] = v
    return d
|
||||
|
||||
# monitoring LXC
|
||||
try:
|
||||
raw = sh("ssh -o BatchMode=yes -o ConnectTimeout=8 root@10.0.10.22 'cat /opt/monitoring/.env 2>/dev/null'")
|
||||
m = parse_env(raw)
|
||||
if m.get("UMAMI_DB_PASS"):
|
||||
lines.append(f"UMAMI_DB_PASS={m['UMAMI_DB_PASS']}")
|
||||
if m.get("UMAMI_APP_SECRET"):
|
||||
lines.append(f"UMAMI_APP_SECRET={m['UMAMI_APP_SECRET']}")
|
||||
except Exception as e:
|
||||
print(f"# skip monitoring: {e}", file=sys.stderr)
|
||||
|
||||
# hermes mattermost
|
||||
try:
|
||||
raw = sh("ssh -o BatchMode=yes -o ConnectTimeout=8 ladmin@10.0.10.36 \"sudo cat /home/hermes/.hermes/secrets/mattermost.env 2>/dev/null\"")
|
||||
h = parse_env(raw)
|
||||
for k in ("MATTERMOST_URL", "MATTERMOST_TOKEN", "MATTERMOST_ALLOWED_USERS"):
|
||||
if h.get(k):
|
||||
lines.append(f"{k}={h[k]}")
|
||||
except Exception as e:
|
||||
print(f"# skip hermes: {e}", file=sys.stderr)
|
||||
|
||||
# merge with existing .env (preserve user-filled keys)
|
||||
existing = {}
|
||||
if out.exists():
|
||||
existing = parse_env(out.read_text())
|
||||
|
||||
merged = {**existing}
|
||||
for line in lines:
|
||||
k, _, v = line.partition("=")
|
||||
merged[k] = v
|
||||
|
||||
header = """# Auto-merged by scripts/vault-pull-infra-secrets.sh + your edits
|
||||
# Run: make vault-import-env
|
||||
|
||||
"""
|
||||
body = "\n".join(f"{k}={v}" for k, v in sorted(merged.items())) + "\n"
|
||||
out.write_text(header + body)
|
||||
print(f"Wrote {len(merged)} keys to {out}")
|
||||
PY
|
||||
|
||||
chmod 600 "${ENV_FILE}" 2>/dev/null || true
|
||||
"${REPO_ROOT}/scripts/vault-import-env.sh" "${ENV_FILE}"
|
||||
Loading…
x
Reference in New Issue
Block a user