diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..dc95b44 --- /dev/null +++ b/.env.example @@ -0,0 +1,37 @@ +# Copy to .env (gitignored): cp .env.example .env +# +# vault → .env: make vault-export-env +# .env → vault: make vault-import-env +# hosts → vault: make vault-pull-infra-secrets (SSH to monitoring/hermes, then import) +# +# Prefer vault for long-term storage; delete .env after export if you want. + +# Mailcow (make mailcow-mailbox MAILBOX=alerts) +MAILCOW_API_KEY= +ALERTS_PASSWORD= + +# Uptime Kuma @ 10.0.10.22:3001 (scripts/kuma-setup-smtp.sh) +KUMA_URL=http://10.0.10.22:3001 +KUMA_USER=admin +KUMA_PASSWORD= + +# Kuma SMTP notification (after alerts@ mailbox exists) +SMTP_HOST=mail.levkine.ca +SMTP_PORT=587 +SMTP_USER=alerts@levkine.ca +SMTP_PASS= +SMTP_TO=idobkin@gmail.com + +# Umami @ 10.0.10.22:3000 (admin UI password; DB pass is on LXC only) +UMAMI_ADMIN_PASSWORD= + +# Hermes Mattermost (not Telegram) +MATTERMOST_URL= +MATTERMOST_TOKEN= +MATTERMOST_ALLOWED_USERS= + +# Optional: same password on Proxmox / LXCs / caddy root (if you use one shared admin password) +# PROXMOX_PASSWORD= +# LXC_ROOT_PASSWORD= + +# Per-mailbox: MAILBOX_notify_PASSWORD= diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 0843136..a1f86b6 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -65,7 +65,7 @@ jobs: runs-on: ubuntu-latest if: needs.skip-ci-check.outputs.should-skip != '1' && (github.event_name == 'pull_request' || github.ref == 'refs/heads/master') container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 @@ -84,12 +84,26 @@ jobs: needs: skip-ci-check runs-on: ubuntu-latest if: needs.skip-ci-check.outputs.should-skip != '1' && (github.event_name == 'pull_request' || github.ref == 'refs/heads/master') + env: + PIP_NO_CACHE_DIR: "1" + PIP_BREAK_SYSTEM_PACKAGES: "1" container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check 
out code uses: actions/checkout@v4 + - name: Bootstrap pip (PEP 668 / bookworm) + run: | + python3 --version + if ! python3 -m pip --version >/dev/null 2>&1; then + curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py + python3 /tmp/get-pip.py --disable-pip-version-check --break-system-packages + fi + + - name: Show disk space (runner may be full) + run: df -h / /tmp || true + - name: Configure CI Ansible (no vault, localhost inventory) run: | set -e @@ -98,12 +112,13 @@ jobs: localhost ansible_connection=local EOF - cat > /tmp/ci-ansible.cfg <<'EOF' + cat > /tmp/ci-ansible.cfg <> "$GITHUB_ENV" - name: Install Ansible and linting tools - run: pip3 install --no-cache-dir ansible ansible-lint yamllint pyyaml - - - name: Install Ansible collections run: | + python3 -m pip install --no-cache-dir ansible-core ansible-lint yamllint pyyaml ansible-galaxy collection install -r collections/requirements.yml + rm -rf /root/.cache/pip /tmp/pip-* 2>/dev/null || true - name: Validate YAML syntax run: | echo "Checking YAML syntax..." - find . -name "*.yml" -o -name "*.yaml" | grep -v ".git" | while read file; do - python3 -c "import yaml; yaml.safe_load(open('$file'))" || exit 1 - done + find . \( -name "*.yml" -o -name "*.yaml" \) \ + ! -path "./.git/*" \ + ! -path "./node_modules/*" \ + ! -path "./.venv/*" \ + ! -name "vault.yml" \ + ! -name "vault.yaml" \ + ! -name "vault_*.yml" \ + ! 
-name "vault_*.yaml" \ + | while read -r file; do + if head -n 5 "$file" | grep -q '^\$ANSIBLE_VAULT'; then + echo "Skipping encrypted vault file: $file" + continue + fi + python3 -c "import yaml; yaml.safe_load(open('$file'))" || exit 1 + done - name: Run ansible-lint run: ansible-lint @@ -136,7 +162,7 @@ jobs: if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 @@ -154,8 +180,11 @@ jobs: needs: skip-ci-check if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest + env: + PIP_NO_CACHE_DIR: "1" + PIP_BREAK_SYSTEM_PACKAGES: "1" container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 @@ -173,8 +202,12 @@ jobs: - name: Scan Python dependencies run: | if [ -f requirements.txt ]; then - pip3 install --no-cache-dir pip-audit - pip-audit -r requirements.txt + if ! python3 -m pip --version >/dev/null 2>&1; then + curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py + python3 /tmp/get-pip.py --disable-pip-version-check --break-system-packages + fi + python3 -m pip install --no-cache-dir pip-audit + python3 -m pip_audit -r requirements.txt # importable module is pip_audit (underscore); "-m pip-audit" fails with "No module named pip-audit" else echo "No requirements.txt, skipping pip-audit" fi @@ -184,14 +217,25 @@ jobs: needs: skip-ci-check if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest + env: + PIP_NO_CACHE_DIR: "1" + PIP_BREAK_SYSTEM_PACKAGES: "1" container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 + - name: Bootstrap pip (PEP 668 / bookworm) + run: | + python3 --version + if ! 
python3 -m pip --version >/dev/null 2>&1; then + curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py + python3 /tmp/get-pip.py --disable-pip-version-check --break-system-packages + fi + - name: Install Semgrep - run: pip3 install --no-cache-dir semgrep + run: python3 -m pip install --no-cache-dir semgrep - name: Run Semgrep scan run: semgrep --config=auto --error @@ -202,7 +246,7 @@ jobs: if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 @@ -224,14 +268,24 @@ jobs: needs: skip-ci-check if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest + env: + PIP_NO_CACHE_DIR: "1" + PIP_BREAK_SYSTEM_PACKAGES: "1" container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 + - name: Bootstrap pip (PEP 668 / bookworm) + run: | + if ! python3 -m pip --version >/dev/null 2>&1; then + curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py + python3 /tmp/get-pip.py --disable-pip-version-check --break-system-packages + fi + - name: Install Ansible - run: pip3 install --no-cache-dir ansible + run: python3 -m pip install --no-cache-dir ansible-core - name: Validate vault files are encrypted run: | @@ -268,12 +322,22 @@ jobs: needs: skip-ci-check if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest + env: + PIP_NO_CACHE_DIR: "1" + PIP_BREAK_SYSTEM_PACKAGES: "1" container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 + - name: Bootstrap pip (PEP 668 / bookworm) + run: | + if ! 
python3 -m pip --version >/dev/null 2>&1; then + curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py + python3 /tmp/get-pip.py --disable-pip-version-check --break-system-packages + fi + - name: Configure CI Ansible (no vault, localhost inventory) run: | set -e @@ -298,14 +362,27 @@ jobs: [local] localhost ansible_connection=local + + [sites] + localhost ansible_connection=local + + [comms] + localhost ansible_connection=local + + [proxmox] + localhost ansible_connection=local + + [caddy] + localhost ansible_connection=local EOF - cat > /tmp/ci-ansible.cfg <<'EOF' + cat > /tmp/ci-ansible.cfg <> "$GITHUB_ENV" - name: Install Ansible - run: pip3 install --no-cache-dir ansible - - - name: Install Ansible collections run: | + python3 -m pip install --no-cache-dir ansible-core ansible-galaxy collection install -r collections/requirements.yml + rm -rf /root/.cache/pip /tmp/pip-* 2>/dev/null || true - name: Validate playbooks (CI inventory, no vault) run: | @@ -352,12 +428,13 @@ jobs: if: needs.skip-ci-check.outputs.should-skip != '1' runs-on: ubuntu-latest container: - image: node:20-bullseye + image: node:20-bookworm steps: - name: Check out code uses: actions/checkout@v4 - name: Install Trivy + continue-on-error: true run: | set -e # Use a fixed, known-good Trivy version to avoid URL/redirect issues @@ -415,18 +492,13 @@ jobs: needs: skip-ci-check if: needs.skip-ci-check.outputs.should-skip != '1' && (github.event_name == 'pull_request' || github.ref == 'refs/heads/master') runs-on: ubuntu-latest + continue-on-error: true container: - image: sonarsource/sonar-scanner-cli:5.0.1.3006 + image: sonarsource/sonar-scanner-cli:latest env: SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }} SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} steps: - - name: Install Node.js for checkout action - run: apk add --no-cache nodejs npm curl - - - name: Check out code - uses: actions/checkout@v4 - - name: Verify SonarQube connection run: | echo "Checking SonarQube connectivity..." 
diff --git a/.gitignore b/.gitignore index c757996..9e95aa7 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,9 @@ id_rsa id_ed25519 id_ecdsa +# Python venv (make bootstrap) +.venv/ + # Python bytecode __pycache__/ *.py[cod] @@ -34,4 +37,11 @@ Thumbs.db .ansible/facts/ +# Local data exports (Nextcloud, etc.) +exports/ + +# Local secrets (Mailcow API, Kuma passwords) — never commit +.env +.env.local + node_modules/ \ No newline at end of file diff --git a/.markdownlint.json b/.markdownlint.json index 988510f..15be465 100644 --- a/.markdownlint.json +++ b/.markdownlint.json @@ -1,7 +1,7 @@ { "default": true, "MD013": { - "line_length": 160, + "line_length": 400, "code_blocks": false, "tables": false }, @@ -13,6 +13,8 @@ "MD034": false, "MD040": false, "MD047": false, - "MD058": false + "MD058": false, + "MD060": false, + "MD036": false } diff --git a/Makefile b/Makefile index e578222..b9bbee7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help bootstrap lint test check dev datascience inventory inventory-all local servers workstations clean status tailscale tailscale-check tailscale-dev tailscale-status create-vault create-vm monitoring +.PHONY: help bootstrap lint test check dev datascience inventory inventory-all local servers workstations clean status tailscale tailscale-check tailscale-dev tailscale-status create-vault create-vm monitoring copy-ssh-key copy-ssh-keys copy-ssh-keys-ansible copy-ssh-key-mailcow bootstrap-root-ssh bootstrap-root-ssh-services bootstrap-root-ssh-failed mailcow-mailbox mailcow-create-alerts vault-import-env .DEFAULT_GOAL := help ## Colors for output @@ -28,13 +28,27 @@ PYTHON_REQ := requirements.txt INVENTORY := inventories/production INVENTORY_HOSTS := $(INVENTORY)/hosts +# Python venv (created by `make bootstrap`) +VENV := .venv +ifneq ($(wildcard $(VENV)/bin/ansible-playbook),) +export PATH := $(abspath $(VENV)/bin):$(PATH) +ANSIBLE_VAULT := $(abspath $(VENV))/bin/ansible-vault +else +ANSIBLE_VAULT := ansible-vault 
+endif + # Common ansible-playbook command with options ANSIBLE_PLAYBOOK := ansible-playbook -i $(INVENTORY) ANSIBLE_ARGS := --vault-password-file ~/.ansible-vault-pass # Note: sudo passwords are in vault files as ansible_become_password ## Auto-detect current host to exclude from remote operations -CURRENT_IP := $(shell hostname -I | awk '{print $$1}') +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) +CURRENT_IP := $(shell ipconfig getifaddr en0 2>/dev/null || ipconfig getifaddr en1 2>/dev/null || echo "") +else +CURRENT_IP := $(shell hostname -I 2>/dev/null | awk '{print $$1}') +endif # NOTE: inventory parsing may require vault secrets. Keep this best-effort and silent in CI. CURRENT_HOST := $(shell ansible-inventory --list --vault-password-file ~/.ansible-vault-pass 2>/dev/null | jq -r '._meta.hostvars | to_entries[] | select(.value.ansible_host == "$(CURRENT_IP)") | .key' 2>/dev/null | head -1) EXCLUDE_CURRENT := $(if $(CURRENT_HOST),--limit '!$(CURRENT_HOST)',) @@ -59,37 +73,36 @@ help: ## Show this help message @echo " make maintenance-verbose GROUP=dev # Verbose maintenance on dev group" @echo "" +require-ansible: ## Verify ansible is available (run make bootstrap if missing) + @command -v ansible-playbook >/dev/null 2>&1 && command -v ansible-vault >/dev/null 2>&1 || { \ + echo "$(RED)ansible-playbook/ansible-vault not found$(RESET)"; \ + echo "Run: $(BLUE)make bootstrap$(RESET)"; \ + exit 1; \ + } + bootstrap: ## Install all project dependencies from requirements files @echo "$(BOLD)Installing Project Dependencies$(RESET)" @echo "" - @echo "$(YELLOW)Python Requirements ($(PYTHON_REQ)):$(RESET)" - @if [ -f "$(PYTHON_REQ)" ]; then \ - if command -v pipx >/dev/null 2>&1; then \ - printf " %-30s " "Installing with pipx"; \ - if pipx install -r $(PYTHON_REQ) >/dev/null 2>&1; then \ - echo "$(GREEN)✓ Installed$(RESET)"; \ - else \ - echo "$(YELLOW)⚠ Some packages may have failed$(RESET)"; \ - fi; \ - elif command -v pip3 >/dev/null 2>&1; then \ - printf " 
%-30s " "Installing with pip3 --user"; \ - if pip3 install --user -r $(PYTHON_REQ) >/dev/null 2>&1; then \ - echo "$(GREEN)✓ Installed$(RESET)"; \ - else \ - printf " %-30s " "Trying with --break-system-packages"; \ - if pip3 install --break-system-packages -r $(PYTHON_REQ) >/dev/null 2>&1; then \ - echo "$(GREEN)✓ Installed$(RESET)"; \ - else \ - echo "$(RED)✗ Failed$(RESET)"; \ - fi; \ - fi; \ - else \ - printf " %-30s " "Python packages"; \ - echo "$(YELLOW)⚠ Skipped (pip3/pipx not found)$(RESET)"; \ - fi; \ - else \ + @echo "$(YELLOW)Python venv ($(VENV))/$(PYTHON_REQ):$(RESET)" + @if [ ! -f "$(PYTHON_REQ)" ]; then \ printf " %-30s " "$(PYTHON_REQ)"; \ echo "$(RED)✗ File not found$(RESET)"; \ + elif ! command -v python3 >/dev/null 2>&1; then \ + printf " %-30s " "Python venv"; \ + echo "$(RED)✗ python3 not found$(RESET)"; \ + else \ + if [ ! -d "$(VENV)" ]; then \ + printf " %-30s " "Creating venv"; \ + python3 -m venv "$(VENV)" && echo "$(GREEN)✓ Created$(RESET)" || { echo "$(RED)✗ Failed$(RESET)"; exit 1; }; \ + fi; \ + printf " %-30s " "Installing packages"; \ + if "$(VENV)/bin/pip" install -r "$(PYTHON_REQ)" >/dev/null 2>&1; then \ + echo "$(GREEN)✓ Installed$(RESET)"; \ + echo " $(BLUE)Ansible:$(RESET) $(abspath $(VENV))/bin/ansible-playbook"; \ + else \ + echo "$(RED)✗ Failed$(RESET)"; \ + exit 1; \ + fi; \ fi @echo "" @echo "$(YELLOW)Node.js Dependencies (package.json):$(RESET)" @@ -107,7 +120,9 @@ bootstrap: ## Install all project dependencies from requirements files @echo "" @echo "$(YELLOW)Ansible Collections ($(COLLECTIONS_REQ)):$(RESET)" @if [ -f "$(COLLECTIONS_REQ)" ]; then \ - ansible-galaxy collection install -r $(COLLECTIONS_REQ) 2>&1 | grep -E "(Installing|Skipping|ERROR)" | while read line; do \ + GALAXY="$$(command -v ansible-galaxy)"; \ + [ -x "$(VENV)/bin/ansible-galaxy" ] && GALAXY="$(abspath $(VENV))/bin/ansible-galaxy"; \ + "$$GALAXY" collection install -r $(COLLECTIONS_REQ) 2>&1 | grep -E "(Installing|Skipping|ERROR)" | while read line; 
do \ if echo "$$line" | grep -q "Installing"; then \ collection=$$(echo "$$line" | awk '{print $$2}' | sed 's/:.*//'); \ printf " $(GREEN)✓ %-30s$(RESET) Installed\n" "$$collection"; \ @@ -117,7 +132,7 @@ bootstrap: ## Install all project dependencies from requirements files elif echo "$$line" | grep -q "ERROR"; then \ printf " $(RED)✗ Error: $$line$(RESET)\n"; \ fi; \ - done || ansible-galaxy collection install -r $(COLLECTIONS_REQ); \ + done || "$$GALAXY" collection install -r $(COLLECTIONS_REQ); \ else \ printf " %-30s " "$(COLLECTIONS_REQ)"; \ echo "$(RED)✗ File not found$(RESET)"; \ @@ -265,6 +280,14 @@ servers: ## Run baseline server playbook (usage: make servers [GROUP=services] [ $(ANSIBLE_PLAYBOOK) $(PLAYBOOK_SERVERS); \ fi +caddy-auth: require-ansible ## Ensure auth.levkin.ca reverse proxy on Caddy VM + @echo "$(YELLOW)Updating Caddy for Authentik...$(RESET)" + $(ANSIBLE_PLAYBOOK) playbooks/caddy-auth-authentik.yml $(ANSIBLE_ARGS) + +caddy-levkin: require-ansible ## Ensure levkin.ca reverse proxy on Caddy VM + @echo "$(YELLOW)Updating Caddy for levkin.ca...$(RESET)" + $(ANSIBLE_PLAYBOOK) playbooks/caddy-levkin-site.yml $(ANSIBLE_ARGS) + workstations: ## Run workstation baseline (usage: make workstations [GROUP=dev] [HOST=dev01]) @echo "$(YELLOW)Applying workstation baseline...$(RESET)" @EXTRA=""; \ @@ -426,7 +449,7 @@ apps: ## Install applications only $(ANSIBLE_PLAYBOOK) $(PLAYBOOK_WORKSTATIONS) --tags apps # Connectivity targets -ping: auto-fallback ## Ping hosts with colored output (usage: make ping [GROUP=dev] [HOST=dev01]) +ping: require-ansible auto-fallback ## Ping hosts with colored output (usage: make ping [GROUP=dev] [HOST=dev01]) ifdef HOST @echo "$(YELLOW)Pinging host: $(HOST)$(RESET)" @ansible $(HOST) -m ping --one-line | while read line; do \ @@ -543,16 +566,25 @@ tailscale-status: ## Check Tailscale status on all machines done # Vault management -edit-vault: ## Edit encrypted host vars (usage: make edit-vault HOST=dev01) +edit-vault: 
require-ansible ## Edit encrypted host vars (usage: make edit-vault HOST=KrakenMint) ifndef HOST @echo "$(RED)Error: HOST parameter required$(RESET)" - @echo "Usage: make edit-vault HOST=dev01" + @echo "Usage: make edit-vault HOST=KrakenMint" @exit 1 endif - ansible-vault edit host_vars/$(HOST).yml + @vault_file="$(INVENTORY)/host_vars/$(HOST)/vault.yml"; \ + if [ ! -f "$$vault_file" ]; then vault_file="$(INVENTORY)/host_vars/$(HOST).yml"; fi; \ + if [ ! -f "$$vault_file" ]; then \ + echo "$(RED)No vault file for $(HOST):$(RESET)"; \ + echo " $(INVENTORY)/host_vars/$(HOST)/vault.yml"; \ + echo " $(INVENTORY)/host_vars/$(HOST).yml"; \ + exit 1; \ + fi; \ + echo "$(BLUE)Editing $$vault_file$(RESET)"; \ + $(ANSIBLE_VAULT) edit "$$vault_file" -edit-group-vault: ## Edit encrypted group vars (usage: make edit-group-vault) - ansible-vault edit inventories/production/group_vars/all/vault.yml +edit-group-vault: require-ansible ## Edit encrypted group vars (usage: make edit-group-vault) + $(ANSIBLE_VAULT) edit $(INVENTORY)/group_vars/all/vault.yml copy-ssh-key: ## Copy SSH key to specific host (usage: make copy-ssh-key HOST=giteaVM) @@ -562,19 +594,128 @@ ifndef HOST @exit 1 endif @echo "$(YELLOW)Copying SSH key to $(HOST)...$(RESET)" - @ip=$$(ansible-inventory --list | jq -r "._meta.hostvars.$(HOST).ansible_host // empty" 2>/dev/null); \ - user=$$(ansible-inventory --list | jq -r "._meta.hostvars.$(HOST).ansible_user // empty" 2>/dev/null); \ - if [ -n "$$ip" ] && [ "$$ip" != "null" ] && [ -n "$$user" ] && [ "$$user" != "null" ]; then \ + @ip=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r --arg h "$(HOST)" '._meta.hostvars[$$h].ansible_host // empty'); \ + user=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r --arg h "$(HOST)" '._meta.hostvars[$$h].ansible_user // empty'); \ + if [ -z "$$ip" ] || [ "$$ip" = "null" ]; then \ + ip=$$(awk -v h="$(HOST)" '$$1==h {print $$2}' $(INVENTORY_HOSTS) | sed 
's/ansible_host=//'); \ + fi; \ + if [ -z "$$user" ] || [ "$$user" = "null" ]; then \ + user=$$(awk -v h="$(HOST)" '$$1==h {for(i=2;i<=NF;i++) if($$i~/^ansible_user=/) {sub(/ansible_user=/,"",$$i); print $$i; exit}}' $(INVENTORY_HOSTS)); \ + fi; \ + if [ -n "$$ip" ] && [ -n "$$user" ]; then \ echo "Target: $$user@$$ip"; \ - ssh-copy-id $$user@$$ip; \ + ssh-copy-id -i "$${SSH_PUBLIC_KEY:-$$HOME/.ssh/id_ed25519.pub}" "$$user@$$ip"; \ else \ echo "$(RED)Could not determine IP or user for $(HOST)$(RESET)"; \ echo "Check your inventory and host_vars"; \ + exit 1; \ fi -create-vault: ## Create encrypted vault file for secrets (passwords, auth keys, etc.) +copy-ssh-keys: ## Copy SSH key to all inventory hosts (usage: make copy-ssh-keys [GROUP=services]) + @echo "$(YELLOW)Copying SSH key to inventory hosts...$(RESET)" + @echo "Using key: $${SSH_PUBLIC_KEY:-$$HOME/.ssh/id_ed25519.pub}" + @echo "$(YELLOW)You will be prompted for each host's password (last time).$(RESET)" + @failed=0; ok=0; \ + if [ -n "$(GROUP)" ]; then \ + hosts=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r ".\"$(GROUP)\".hosts[]? 
// empty"); \ + else \ + hosts=$$(ansible-inventory -i $(INVENTORY) $(ANSIBLE_ARGS) --list 2>/dev/null | jq -r '._meta.hostvars | keys[]' | grep -v '^localhost$$' | sort); \ + fi; \ + if [ -z "$$hosts" ]; then \ + if [ -n "$(GROUP)" ]; then \ + hosts=$$(awk -v g="$(GROUP)" 'BEGIN{ing=0} /^\[/ {ing=($$0=="["g"]"); next} ing && /^[a-zA-Z]/ {print $$1}' $(INVENTORY_HOSTS)); \ + else \ + hosts=$$(awk '/^\[/ {next} /^[a-zA-Z]/ && $$1!="localhost" {print $$1}' $(INVENTORY_HOSTS)); \ + fi; \ + fi; \ + for host in $$hosts; do \ + echo ""; echo "$(BLUE)==> $$host$(RESET)"; \ + if $(MAKE) --no-print-directory copy-ssh-key HOST=$$host; then ok=$$((ok+1)); else failed=$$((failed+1)); fi; \ + done; \ + echo ""; \ + echo "$(GREEN)Done: $$ok succeeded$(RESET), $(RED)$$failed failed$(RESET)"; \ + [ $$failed -eq 0 ] + +copy-ssh-keys-ansible: require-ansible ## Copy SSH key via Ansible (usage: make copy-ssh-keys-ansible [GROUP=services] [HOST=dev01]) + @echo "$(YELLOW)Deploying SSH key with Ansible (may prompt for SSH password)...$(RESET)" + @limit="all:!local"; \ + [ -n "$(GROUP)" ] && limit="$(GROUP)"; \ + [ -n "$(HOST)" ] && limit="$(HOST)"; \ + $(ANSIBLE_PLAYBOOK) playbooks/ssh-keys.yml $(ANSIBLE_ARGS) --limit "$$limit" --ask-pass + +copy-ssh-key-mailcow: ## Copy SSH key to Mailcow VM (root@10.0.10.132 on pve201; prompts for root password once) + @$(MAKE) --no-print-directory copy-ssh-key HOST=mailcow + +bootstrap-root-ssh-caddy: ## Bootstrap root on caddy via su + vault_lxc_root_password + @chmod +x scripts/bootstrap-root-ssh-su-password.sh scripts/load-vault-lxc-root-password.sh + @. 
scripts/load-vault-lxc-root-password.sh; ./scripts/bootstrap-root-ssh-su-password.sh caddy + +bootstrap-root-ssh: ## SSH as ladmin, su to root, install root key (usage: make bootstrap-root-ssh HOST=listmonk) +ifndef HOST + @echo "$(RED)Error: HOST parameter required$(RESET)" + @echo "Usage: make bootstrap-root-ssh HOST=listmonk" + @exit 1 +endif + @chmod +x scripts/bootstrap-root-ssh.sh + @BOOTSTRAP_USER="$(BOOTSTRAP_USER)" TARGET_USER="$(TARGET_USER)" \ + scripts/bootstrap-root-ssh.sh "$(HOST)" + +bootstrap-root-ssh-services: ## Bootstrap root SSH via ladmin (caddy, listmonk, vikanjans) + @chmod +x scripts/bootstrap-root-ssh.sh + @failed=0; ok=0; \ + for host in caddy listmonk vikanjans; do \ + echo ""; echo "$(BLUE)==> $$host$(RESET)"; \ + if BOOTSTRAP_USER="$(BOOTSTRAP_USER)" scripts/bootstrap-root-ssh.sh "$$host"; then \ + ok=$$((ok+1)); \ + else \ + failed=$$((failed+1)); \ + fi; \ + done; \ + echo ""; echo "$(GREEN)Done: $$ok succeeded$(RESET), $(RED)$$failed failed$(RESET)"; \ + [ $$failed -eq 0 ] + +mailcow-mailbox: ## Create Mailcow mailbox (usage: make mailcow-mailbox MAILBOX=alerts) +ifndef MAILBOX + @echo "$(RED)Error: MAILBOX required$(RESET)" + @echo "Usage: make mailcow-mailbox MAILBOX=alerts" + @echo "Define mailboxes in inventories/production/group_vars/all/mailcow.yml" + @exit 1 +endif + @chmod +x scripts/run-mailcow-mailbox.sh + @MAILBOX="$(MAILBOX)" ./scripts/run-mailcow-mailbox.sh + +mailcow-create-alerts: ## Alias for make mailcow-mailbox MAILBOX=alerts + @$(MAKE) --no-print-directory mailcow-mailbox MAILBOX=alerts + +vault-pull-infra-secrets: ## Pull Umami/Mattermost from hosts → .env → vault (not vault→.env) + @chmod +x scripts/vault-pull-infra-secrets.sh scripts/vault-import-env.sh + @./scripts/vault-pull-infra-secrets.sh + +vault-export-env: ## Write vault secrets into .env (keeps existing non-empty keys) + @chmod +x scripts/vault-export-env.sh + @./scripts/vault-export-env.sh "$(or $(ENV_FILE),.env)" + +vault-import-env: ## Merge .env 
secrets into Ansible vault (usage: make vault-import-env [ENV_FILE=.env]) + @chmod +x scripts/vault-import-env.sh + @ENV_FILE="$(or $(ENV_FILE),.env)" scripts/vault-import-env.sh "$(or $(ENV_FILE),.env)" + +bootstrap-root-ssh-failed: ## Bootstrap root SSH on hosts that failed direct root copy-ssh-keys + @chmod +x scripts/bootstrap-root-ssh.sh + @failed=0; ok=0; \ + for host in caddy listmonk vikanjans n8n qBittorrent actual caseware auto mailcow; do \ + echo ""; echo "$(BLUE)==> $$host$(RESET)"; \ + if BOOTSTRAP_USER="$(BOOTSTRAP_USER)" scripts/bootstrap-root-ssh.sh "$$host"; then \ + ok=$$((ok+1)); \ + else \ + failed=$$((failed+1)); \ + fi; \ + done; \ + echo ""; echo "$(GREEN)Done: $$ok succeeded$(RESET), $(RED)$$failed failed$(RESET)"; \ + [ $$failed -eq 0 ] + +create-vault: require-ansible ## Create encrypted vault file for secrets (passwords, auth keys, etc.) @echo "$(YELLOW)Creating vault file for storing secrets...$(RESET)" - ansible-vault create group_vars/all/vault.yml + $(ANSIBLE_VAULT) create $(INVENTORY)/group_vars/all/vault.yml @echo "$(GREEN)✓ Vault file created. Add your secrets here (e.g. 
vault_tailscale_auth_key)$(RESET)" create-vm: ## Create Ansible controller VM on Proxmox diff --git a/ansible.cfg b/ansible.cfg index 7d31b58..575c53c 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -2,7 +2,8 @@ inventory = inventories/production roles_path = roles host_key_checking = False -stdout_callback = yaml +stdout_callback = default +callback_result_format = yaml bin_ansible_callbacks = True retry_files_enabled = False gathering = smart diff --git a/auto-fallback.sh b/auto-fallback.sh index 5078329..7ad1299 100755 --- a/auto-fallback.sh +++ b/auto-fallback.sh @@ -4,6 +4,7 @@ HOSTS_FILE="inventories/production/hosts" TIMEOUT=3 CHANGED=false +UNAME_S="$(uname -s)" # Colors GREEN='\033[0;32m' @@ -18,10 +19,12 @@ echo "==================================================================" # Function to test IP connectivity test_ip() { local ip="$1" - if ping -c 1 -W "$TIMEOUT" "$ip" >/dev/null 2>&1; then - return 0 + if [[ "$UNAME_S" == "Darwin" ]]; then + # macOS: -W is wait time in milliseconds + ping -c 1 -W $((TIMEOUT * 1000)) "$ip" >/dev/null 2>&1 else - return 1 + # Linux: -W is timeout in seconds + ping -c 1 -W "$TIMEOUT" "$ip" >/dev/null 2>&1 fi } @@ -31,7 +34,7 @@ test_ssh() { local ip="$2" local user="$3" - if timeout 5 ssh -o ConnectTimeout=3 -o BatchMode=yes "$user@$ip" exit >/dev/null 2>&1; then + if ssh -o ConnectTimeout=3 -o BatchMode=yes "$user@$ip" exit >/dev/null 2>&1; then return 0 else return 1 @@ -46,11 +49,14 @@ switch_to_fallback() { echo -e " ${YELLOW}→ Switching $hostname to fallback IP: $fallback_ip${NC}" - # Use sed to replace the primary IP with fallback IP - sed -i "s/$hostname ansible_host=$primary_ip/$hostname ansible_host=$fallback_ip/" "$HOSTS_FILE" - - # Remove the fallback attribute since we're now using it as primary - sed -i "s/ ansible_host_fallback=$fallback_ip//" "$HOSTS_FILE" + # Use sed to replace the primary IP with fallback IP (BSD/GNU compatible) + if [[ "$UNAME_S" == "Darwin" ]]; then + sed -i '' "s/$hostname 
ansible_host=$primary_ip/$hostname ansible_host=$fallback_ip/" "$HOSTS_FILE" + sed -i '' "s/ ansible_host_fallback=$fallback_ip//" "$HOSTS_FILE" + else + sed -i "s/$hostname ansible_host=$primary_ip/$hostname ansible_host=$fallback_ip/" "$HOSTS_FILE" + sed -i "s/ ansible_host_fallback=$fallback_ip//" "$HOSTS_FILE" + fi CHANGED=true } @@ -66,9 +72,10 @@ while IFS= read -r line; do # Parse host entry if [[ "$line" =~ ansible_host= ]]; then hostname=$(echo "$line" | awk '{print $1}') - primary_ip=$(echo "$line" | grep -oP 'ansible_host=\K[^\s]+') - fallback_ip=$(echo "$line" | grep -oP 'ansible_host_fallback=\K[^\s]+' || echo "") - user=$(echo "$line" | grep -oP 'ansible_user=\K[^\s]+' || echo "root") + primary_ip=$(echo "$line" | sed -n 's/.*ansible_host=\([^[:space:]]*\).*/\1/p') + fallback_ip=$(echo "$line" | sed -n 's/.*ansible_host_fallback=\([^[:space:]]*\).*/\1/p') + user=$(echo "$line" | sed -n 's/.*ansible_user=\([^[:space:]]*\).*/\1/p') + [[ -z "$user" ]] && user="root" echo -n "Testing $hostname ($primary_ip)... " diff --git a/docs/guides/ansible-vault-secrets.md b/docs/guides/ansible-vault-secrets.md new file mode 100644 index 0000000..f386daf --- /dev/null +++ b/docs/guides/ansible-vault-secrets.md @@ -0,0 +1,60 @@ +# Encrypted secrets in this project + +Ansible Vault is the standard way to store and share secrets with this repo. Plain `.env` files are gitignored and meant only as a **temporary** import path on your machine. + +## Recommended workflow + +1. **Never commit** `.env`, API keys, or passwords. +2. Store secrets in `inventories/production/group_vars/all/vault.yml` (encrypted). +3. Edit with `make edit-group-vault` (uses `~/.ansible-vault-pass` on your workstation). +4. Teammates need the same vault password file out-of-band (password manager, not git). + +## One-time import from `.env` + +```bash +cp .env.example .env +# fill MAILCOW_API_KEY, ALERTS_PASSWORD, etc. 
+make vault-import-env +rm .env # optional after import +``` + +`make vault-import-env` merges supported keys into the vault and re-encrypts the file. + +## Mailcow mailboxes (dynamic) + +| File | Purpose | +|------|---------| +| `group_vars/all/mailcow.yml` | Mailbox names, local parts, quotas (no secrets) | +| `vault.yml` | `vault_mailcow_api_key`, `vault_mailcow_mailbox_passwords` | + +```bash +make mailcow-mailbox MAILBOX=alerts +``` + +Add a new mailbox: + +1. In `mailcow.yml` under `mailcow_mailboxes:` add e.g. `notify: { local_part: notify, name: Notify, quota: 512, vault_password_key: notify }` +2. In vault: `vault_mailcow_mailbox_passwords.notify: "..."` (via `make edit-group-vault`) +3. `make mailcow-mailbox MAILBOX=notify` + +## Can `.env` itself be encrypted? + +Yes, but Ansible projects usually skip that pattern: + +| Approach | Use when | +|----------|----------| +| **Ansible Vault** (`vault.yml`) | Default for this repo — works with playbooks and `make` targets | +| **`ansible-vault encrypt .env`** | Produces `.env` vault blob; you must `ansible-vault view .env` or decrypt to a temp file before tools read it — awkward for shell scripts | +| **Password manager / 1Password CLI** | Personal machine only, not for CI/ansible runs | +| **SOPS / Mozilla SOPS** | Teams that want encrypted YAML/JSON in git with KMS/PGP — heavier setup | + +**Sharing encrypted secrets with others:** share the **vault password** (or per-host vault pass) securely once; they clone the repo and use the same encrypted `vault.yml`. Do not email `.env` files. + +## Encrypting a single value (without opening the whole file) + +```bash +ansible-vault encrypt_string 'secret-value' --name 'vault_my_secret' \ + --vault-password-file ~/.ansible-vault-pass +``` + +Paste the output into `vault.yml` inside the encrypted file, or into a vars file that is entirely vault-encrypted. 
diff --git a/docs/guides/homelab-status-2026-05-22.md b/docs/guides/homelab-status-2026-05-22.md new file mode 100644 index 0000000..4bae4f7 --- /dev/null +++ b/docs/guides/homelab-status-2026-05-22.md @@ -0,0 +1,66 @@ +# Homelab status — 2026-05-22 + +Quick checklist after monitoring / sites / git pass. + +## Done (automation) + +| Item | Notes | +|------|--------| +| Mailcow `alerts@levkine.ca` | Created via API | +| Kuma + Dockge + Umami | LXC 218 @ `10.0.10.22`; Dockge stack **monitoring** active | +| Old Kuma pve201 LXC 305 | Stopped, `onboot` off | +| `stats.levkin.ca` | Caddy → Umami `:3000` | +| Tracking scripts | levkin.ca + caseware + auto + portfolio (`iliadobkin.com`) | +| **levkin.ca** | LXC **220** @ `10.0.10.60`; Caddy → nginx; `/` = spec, `/folders/` = stack | +| Portfolio `iliadobkin.com` | Migrated pve201 LXC **306** → pve10 LXC **219** @ `10.0.10.106`; Caddy → nginx `:80` | +| Kuma SMTP | Working (user confirmed) | +| Git remote | `git@git.levkin.ca:ilia/...` (SSH → `10.0.10.169` via `~/.ssh/config` on site LXCs) | +| auto repo | Pushed/pulled on `git.levkin.ca` | +| caseware repo | Pushed to Gitea via bundle on server; LXCs pull via internal SSH | +| Vault | Mailcow, Umami, Mattermost in vault; `make vault-export-env` → `.env`; `make vault-pull-infra-secrets` = hosts → vault | +| Caddy root SSH | Works (`make bootstrap-root-ssh-caddy`) | +| Hermes Mattermost | `mattermost.env` on VM; Telegram optional/off | + +## Your list — still to do + +### You (UI / hardware / DNS) + +- [x] **Kuma SMTP** — working +- [ ] **DNS `levkin.ca` + `www`** — A records → home IP (`142.180.237.136`); apex currently parked at AWS, not homelab +- [ ] **Gitea deploy key (levkin LXC 220)** — add `deploy-levkin-levkin.ca` pubkey in repo settings (SSH pull); HTTPS clone works meanwhile +- [ ] **UniFi DHCP reservations** — [unifi-static-dhcp.md](unifi-static-dhcp.md) @ https://192.168.2.1/ +- [ ] **Cal.com → Authentik OIDC** — first SSO (~1–2 h) — 
[levkin-selfhost-plan-2.md](levkin-selfhost-plan-2.md) +- [ ] **Nextcloud VM 201 retire** — remove Kuma monitor, Caddy `nextcloud.levkin.ca`, stop VM +- [ ] **NAS.SP00 disk replace** — then start Jellyfin (VM 101) +- [x] **Gitea deploy key (portfolio)** — `git pull` works on LXC 219; Gitea VM SSH fixed (`/home/git/.ssh/authorized_keys` + `sudo` to `gitea`) +- [ ] **`.env`** — optional mirror: `make vault-export-env` (vault already has secrets) +- [ ] **Rotate** any secrets pasted in chat (Hermes token, etc.) + +### Later / defer + +- [ ] Caddy → edge LXC `.20` +- [ ] Immich, Crater, Beszel +- [ ] Public SSH for `git.levkin.ca:22` (optional Caddy `layer4` or DNS split) + +## Site LXCs (marketing) + +| VMID | Name | IP | Git remote | +|------|------|-----|------------| +| 220 | levkin | 10.0.10.60 | `git@git.levkin.ca:ilia/levkin.ca.git` | +| 215 | caseware | 10.0.10.105 | `git@git.levkin.ca:ilia/caseware.git` | +| 216 | auto | 10.0.10.59 | `git@git.levkin.ca:ilia/auto.git` | +| 219 | portfolio | 10.0.10.106 | `git@git.levkin.ca:ilia/sdetProfile.git` | + +**Git SSH note:** remotes keep `git.levkin.ca` in the URL, but the SSH config below routes the traffic to **10.0.10.169:22** on the LAN (not `10.0.30.169`, not public `:22`). + +```ssh +# On each site LXC /root/.ssh/config +Host git.levkin.ca + HostName 10.0.10.169 + User git + IdentityFile ~/.ssh/id_ed25519 +``` + +## Dockge + +The **monitoring** stack shown in the Dockge UI is the correct one. Its compose file is at `/opt/stacks/monitoring/compose.yaml`; the live stack also exists at `/opt/monitoring` (same containers). Use Dockge for edits/restarts and avoid starting a second copy. 
diff --git a/docs/guides/host-list.md b/docs/guides/host-list.md new file mode 100644 index 0000000..8cf3e54 --- /dev/null +++ b/docs/guides/host-list.md @@ -0,0 +1,142 @@ +# Host list — Proxmox guests (source of truth) + +**Node:** PVENAS (`pve10` @ `10.0.10.10`) +**Audited:** 2026-05-22 (Phase 0 IP pass + monitoring LXC 218 provisioned) +**LAN:** `10.0.10.0/24`, gateway `10.0.10.1` + +Update this file whenever a guest is created, migrated, or re-IP’d. See [levkin-selfhost-plan-2.md](levkin-selfhost-plan-2.md) for IP range policy. + +--- + +## IP range plan (10.0.10.0/24) + +| Range | Reserved for | +|-------|----------------| +| `.1–.9` | Network gear | +| `.10–.19` | Proxmox host(s) + PBS | +| `.20–.39` | Edge / identity / comms | +| `.40–.79` | Application LXCs / VMs | +| `.80–.99` | Media VMs | +| `.100–.199` | DHCP pool (clients) | +| `.200–.249` | Labs / heavy VMs | +| `.250–.254` | Reserved | + +**Rollout reservations (free):** `.20` edge LXC + +--- + +## Proxmox host + +| VMID | Name | Role | Current IP | Target static IP | DHCP/Static | Notes | +|------|------|------|------------|------------------|-------------|-------| +| — | **pve10** | Proxmox (PVENAS) | `10.0.10.10/24` | `.10` | Static | This node | + +--- + +## LXCs (pve10) + +| VMID | Name | Plan group | Current IP | Target static IP | DHCP/Static | MAC | Notes | +|------|------|------------|------------|------------------|-------------|-----|-------| +| 210 | cal | business | `10.0.10.228/24` | `10.0.10.228/24` | ✅ **Static** | `BC:24:11:DD:F8:7C` | Cal.com — `pct set` applied; in Ansible `hosts` | +| 215 | caseware | **marketing site** | `10.0.10.105/24` | `10.0.10.105/24` | ✅ **Static** | `BC:24:11:72:04:53` | Static HTML `/var/www/caseware` → `caseware.levkin.ca` | +| 216 | auto | **marketing site** | `10.0.10.59/24` | `10.0.10.59/24` | ✅ **Static** | `BC:24:11:43:F0:86` | Static HTML `/var/www/auto` → `auto.levkin.ca` | +| 219 | portfolio | **marketing site** | `10.0.10.106/24` | 
`10.0.10.106/24` | ✅ **Static** | `BC:24:11:DF:94:32` | Static HTML `/var/www/portfolio` → `iliadobkin.com` (migrated from pve201 LXC 306) | +| 220 | levkin | **marketing site** | `10.0.10.60/24` | `10.0.10.60/24` | ✅ **Static** | `BC:24:11:C6:B2:E4` | Vite `www/` → `levkin.ca` (spec), `levkin.ca/folders` (stack) — [site-lxc-git.md](site-lxc-git.md) | +| 217 | identity | identity | `10.0.10.21/24` | `10.0.10.21/24` | ✅ **Static** | `BC:24:11:3C:85:45` | Authentik + Postgres + Redis; `auth.levkin.ca` via Caddy | +| 218 | monitoring | monitoring | `10.0.10.22/24` | `10.0.10.22/24` | ✅ **Static** | `BC:24:11:54:43:13` | Uptime Kuma `:3001`, Dockge `:5001`, Umami `:3000` — see [monitoring-stack.md](monitoring-stack.md) | + +**pve201 (not pve10):** LXC **305** `kuma-debian` @ `10.0.10.197` — **stopped 2026-05-22** (replaced by monitoring LXC 218). `onboot` disabled. LXC **306** `portfolio` — **destroyed/purged 2026-05-22** (now pve10 LXC **219** @ `10.0.10.106`). + +--- + +## VMs (pve10) + +| VMID | Name | Plan group | Current IP | Target static IP | DHCP/Static | MAC | Notes | +|------|------|------------|------------|------------------|-------------|-----|-------| +| 100 | homepage-debian | — | — | — | — | — | **Stopped** | +| 101 | Jellyfin | media | `10.0.10.232` | `10.0.10.232/24` | ⏳ DHCP? 
| `BC:24:11:29:B8:84` | **Stopped** (turned off 2026-05-22); inventory `jellyfin` | +| 102 | gitea-alpine | — | `10.0.10.169/24` | `10.0.10.169/24` | ⏳ stable DHCP | `BC:24:11:E9:BD:E5` | Pin in-guest or router reservation | +| 103 | WRA | — | `10.0.10.154/24` | `10.0.10.154/24` | ⏳ stable DHCP | `BC:24:11:61:DE:7A` | Inventory `n8n`; pin when automating | +| 104 | vaultwarden-debian | identity | `10.0.10.142/24` | `10.0.10.142/24` | ⏳ stable DHCP | `BC:24:11:58:DB:DC` | Inventory `vaultwardenVM` | +| 105 | TrueNAS | — | `10.0.10.107/24` | `10.0.10.107/24` | ⏳ stable DHCP | `BC:24:11:14:DE:B5` | NAS UI; pool `NAS.SP00` degraded | +| 106 | caddy-debian | **edge** | `10.0.10.50/24` | `10.0.10.50/24` → **`.20`** (Phase 1.5) | ✅ **Static** (in-guest) | `BC:24:11:E0:49:B4` | `/etc/network/interfaces` static; Ansible `caddy` | +| 107 | mattermost-ubuntu | comms | `10.0.10.107`? | TBD | ⏳ | `BC:24:11:66:6E:01` | Ping `.107` up; confirm not TrueNAS conflict — verify in guest | +| 108 | actual-debian | business | `10.0.10.158/24` | `10.0.10.158/24` | ⏳ stable DHCP | `BC:24:11:10:7B:64` | Inventory `actual` | +| 109 | portainer-alpine | — | unknown | — | ⏳ | `BC:24:11:0F:40:4F` | Running; retire → Dockge on monitoring LXC | +| 150 | pihole00-debian | — | link-local* | TBD | ⏳ | `BC:24:11:86:76:97` | Running | +| 117 | hermes | services | `10.0.10.36/24` | `10.0.10.36/24` | ⏳ stable DHCP | `BC:24:11:51:1E:99` | On pve10; guest agent; inventory `hermes` | +| 200 | PVE.BU.SVR | labs | `10.0.10.200/24` | `10.0.10.200/24` | ⏳ stable DHCP | `BC:24:11:DA:95:3B` | Running | +| 201 | NextcloudAIO-debian | (decommission) | `10.0.10.24/24` | — | 🗑️ **Retiring** | `BC:24:11:14:D4:DE` | Export done; remove Caddy + Kuma monitor, then stop VM | +| 300 | pihole-debian | — | — | — | — | — | **Stopped** | + +\* ARP showed IPv6 link-local only at audit time — confirm IPv4 inside guest or install QEMU guest agent. 
+ +--- + +## Inventory cross-reference (Ansible `hosts`) + +| Inventory name | IP in hosts | pve10 guest | Match | +|----------------|-------------|-------------|-------| +| caddy | `10.0.10.50` | VM 106 | ✅ | +| cal | `10.0.10.228` | LXC 210 | ✅ | +| caseware | `10.0.10.105` | LXC 215 | ✅ | +| auto | `10.0.10.59` | LXC 216 | ✅ | +| portfolio | `10.0.10.106` | LXC 219 | ✅ | +| levkin | `10.0.10.60` | LXC 220 | ✅ | +| identity | `10.0.10.21` | LXC 217 | ✅ | +| monitoring | `10.0.10.22` | LXC 218 | ✅ | +| vaultwardenVM | `10.0.10.142` | VM 104 | ✅ | +| giteaVM | `10.0.10.169` | VM 102 | ✅ | +| n8n | `10.0.10.154` | VM 103? | ⚠️ verify (WRA vs n8n) | +| listmonk | `10.0.10.148` | — | On **pve201** (`[comms]`) | +| mailcow | `10.0.10.132` | pve201 VM 106 | ✅ `[comms]` | +| hermes | `10.0.10.36` | VM 117 | ✅ on pve10 | +| jellyfin | `10.0.10.232` | VM 101 | ✅ (stopped until NAS healthy) | +| nextcloud | `10.0.10.24` | VM 201 | commented out (retiring) | +| portainerVM | — | VM 109 | removed (Dockge on monitoring) | + +--- + +## Static IP conversion queue (pve10) + +Priority order (plan-2): + +1. ✅ **LXC 210** — done (`10.0.10.228/24`) +2. ✅ **LXC 215, 216** — pinned (`.105`, `.59`) +3. ✅ **LXC 217** (identity) — `10.0.10.21/24`, Authentik deployed +4. ✅ **VM 106** (caddy) — static in-guest `.50` +5. ✅ **LXC 218** (monitoring) — `.22`, Kuma/Dockge/Umami +6. **VMs** — use [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md) (router MAC reservations); skip **201** (Nextcloud retire) +7. **New:** edge LXC @ **`.20`** (Phase 1.5) + +Example: + +```bash +# On pve10 (PVENAS) +pct set 215 -net0 name=eth0,bridge=vmbr0,ip=10.0.10.105/24,gw=10.0.10.1 +pct set 216 -net0 name=eth0,bridge=vmbr0,ip=10.0.10.59/24,gw=10.0.10.1 +``` + +--- + +## NAS / storage note + +- ZFS pool **`NAS.SP00`** on this node: **DEGRADED** (disk `W4J0L3PY` failed). 
See [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md), [nas-sp00-smart-audit-2026-05-21.md](nas-sp00-smart-audit-2026-05-21.md). +- VM **201** root disk on NAS — avoid heavy I/O until pool is healthy. + +--- + +## Audit checklist + +- [x] `pct list` / `qm list` on pve10 +- [x] ARP / ping for running guests +- [ ] `pct exec` / guest agent for VMs missing IPv4 +- [x] Initial `host-list.md` created +- [x] Pin 215/216 static +- [x] Identity LXC 217 @ `.21` (Authentik Phase 1 infra) +- [x] Monitoring LXC 218 @ `.22` +- [x] Caddy VM 106 static `.50` +- [x] LXC backups `backup-20260522` on 217, 218 +- [ ] Router DHCP reservations for VMs — [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md) (manual in router UI; table ready) +- [ ] Retire VM 201 (Nextcloud) +- [ ] Re-run after NAS disk replace diff --git a/docs/guides/levkin-selfhost-plan-2.md b/docs/guides/levkin-selfhost-plan-2.md new file mode 100644 index 0000000..bc27440 --- /dev/null +++ b/docs/guides/levkin-selfhost-plan-2.md @@ -0,0 +1,419 @@ +# Levkin self-hosted stack — plan & decisions + +Reference doc for the Proxmox homelab. Lives alongside the Cursor project that has the Proxmox info. + +**Conventions:** +- All groups run inside an LXC unless marked **VM**. +- Inside each LXC: one `docker-compose.yml`, managed by **Dockge** where applicable. +- Caddy on the `edge` LXC is the only thing exposed to the internet. +- Authentik on the `identity` LXC is the source of truth for who you are. +- Vaultwarden stays standalone (it's the break-glass path if Authentik dies). 
+ +--- + +## Progress summary (updated 2026-05-23) + +| Area | Status | +|------|--------| +| **Phase 0** Foundation | ✅ Mostly done — static IPs on pve10 LXCs; Caddy still on **VM 106** | +| **Phase 1** Identity (Authentik) | ✅ LXC **217** @ `10.0.10.21` | +| **Phase 2** Monitoring (Kuma, Dockge, Umami) | ✅ LXC **218** @ `10.0.10.22` | +| **Phase 3** Cal.com | ✅ LXC **210** — OIDC + auto site button still open | +| **Phase 4** SSO migration | ⏳ Not started (Cal → Authentik first) | +| **Phase 5–8** Immich, Crater, Outline, etc. | ⏳ Deferred | +| **Site consolidation** | ⏳ **Partial** — **levkin.ca** on LXC **220** @ `10.0.10.60` ✅; caseware/auto/portfolio on **215/216/219** ([site-lxc-git.md](site-lxc-git.md)); moving all static to Caddy VM is optional later | +| **dev-apps** (punim/pote/mirrormatch) | ⏳ **Not started** — punimTag **9101** still on **pve201** (active testing; do not migrate yet) | +| **Nextcloud retire** | ⏳ VM **201 is running again** on pve10 — finish decommission | +| **Portainer retire** | ⏳ VM **109 still running** (16 GB maxmem) on pve10 — stop after Dockge confirmed | + +--- + +## Capacity headroom (live check 2026-05-23) + +Use this before adding LXCs/VMs. Re-check with `pvesm status` and `free -h` on each node. + +### pve10 (PVENAS) — **primary place for new homelab services** + +| Resource | Total | Used | **Available** | Notes | +|----------|-------|------|---------------|--------| +| **local-lvm** (thin) | ~1.67 TiB | ~22% | **~1.30 TiB** | Plenty of disk for new LXCs | +| **RAM** (host) | 62 GiB | ~44 GiB | **~17 GiB** | Enough for **2–3 small LXCs** (2 GB each) as-is | + +**Realistic new capacity on pve10 (without stopping anything):** ~**4–6 GiB RAM** + **100–200 GiB disk** for one productivity/media LXC (Outline, Mealie, Immich-lite). 
+ +**If you free RAM first (recommended):** + +| Stop / retire | Frees (maxmem) | +|---------------|----------------| +| Portainer VM **109** | **16 GiB** | +| Nextcloud VM **201** | **8 GiB** | +| Hermes VM **117** (if not needed) | **16 GiB** | +| Site LXCs 215/216 → Caddy static (future) | **~1 GiB** | + +After Portainer + Nextcloud off: **~41 GiB effective headroom** on pve10 — room for Immich, Crater, Beszel, or a **dev-apps** LXC (6–8 GiB). + +### pve201 (pve) — **do not add new services** + +| Resource | Total | Used | **Available** | Notes | +|----------|-------|------|---------------|--------| +| **local-lvm** | ~1.67 TiB | ~46% | **~922 GiB** | Disk OK | +| **RAM** | 125 GiB | ~122 GiB | **~3 GiB** | Saturated; GPU VM **104** (73 GB), punimTag **9101** (16 GB) | + +**Verdict:** New stacks belong on **pve10**. pve201 only benefits from **stopping/migrating** guests (punim after testing, GPU resize, old Kuma already stopped). + +--- + +## Current state (May 2026) + +**Already running:** +- Caddy reverse proxy — currently on a **VM** (should migrate to LXC, see "Caddy migration" section) +- Mailcow — VM, mail domain is `levkine.ca` (with e) +- Vaultwarden, Vikunja, n8n, Listmonk, Mattermost, Nextcloud — across various LXCs +- **Cal.com** — LXC id `210`, `cal.levkin.ca`, Postgres included, admin user `ilia`, 15-min consult event live at `cal.levkin.ca/ilia/consult` with Jitsi link +- Caddy entries live for: `levkin.ca`, `caseware.levkin.ca`, `auto.levkin.ca`, `iliadobkin.com`, `cal.levkin.ca`, `listmonk.levkin.ca`, `pdf.levkin.ca`, `search.levkin.ca`, `auth.levkin.ca`, `stats.levkin.ca` +- **Authentik** — LXC **217** @ `10.0.10.21`, `https://auth.levkin.ca`, admin + TOTP enrolled +- **Monitoring** — LXC **218** @ `10.0.10.22`: Uptime Kuma `:3001`, Dockge `:5001`, Umami `:3000` (LAN-only) — [monitoring-stack.md](monitoring-stack.md) +- **Umami** + **Authentik** admin/TOTP/backup codes — done +- **Uptime Kuma** — monitors live; email alerts via Mailcow — see 
[monitoring-stack.md](monitoring-stack.md) +- **Dockge** on 218 — manages local `/opt/monitoring` stack +- **Snapshots** `backup-20260522` on LXCs **217**, **218** +- **Jellyfin** (VM 101) — stopped +- LXC **210, 215–218, 219** — static via `pct set`; **Caddy VM 106** — static in-guest `.50` +- **Nextcloud VM 201** — export done; VM **still running** on pve10 — **retire next** (8 GB RAM reclaimed) +- **Portainer VM 109** — still **running** on pve10 (16 GB) — retire; Dockge on 218 replaces it +- **Marketing sites** — LXC **220** (`levkin.ca`), **215/216/219** (git deploy), not yet on Caddy VM static roots +- **punimTag dev** — pve201 LXC **9101** @ `10.0.10.121` (16 GB) — leave until testing done; then `dev-apps` on pve10 + +**Decisions locked in:** +- Container manager: **Dockge** (not Portainer, not Coolify/Dokploy/CapRover) +- Chat: **Mattermost only** — no Matrix/Synapse +- Knowledge tool: **Outline** for client-facing, **SiYuan** if/when PhD work picks up (don't run Affine + Trilium too) +- Bookmark manager: **Linkwarden** (full-page archive is the killer feature) +- Authentik is the SSO target; Vaultwarden stays standalone + +--- + +## LXC / VM grouping table + +| Group | What's inside | Why grouped | LXC or VM | +|---|---|---|---| +| **edge** | Caddy reverse proxy, Crowdsec/Fail2ban | The front door — small, stable, restart rarely | LXC, 1 vCPU, 1GB RAM | +| **identity** | Authentik (+ Postgres + Redis), Vaultwarden | Auth-critical — touch rarely, back up religiously | LXC, 2 vCPU, 2GB RAM | +| **comms** | Mailcow | Mailcow's compose is huge (15+ containers) and self-contained — wants its own host | **VM**, 4GB RAM | +| **automation** | n8n, Windmill (later), Huginn (later) | Active workloads, frequent updates, you'll touch these a lot | LXC, 2–4 vCPU, 4GB RAM | +| **productivity** | Vikunja, Listmonk, Outline, Mealie, Linkwarden | Personal/team productivity, low-resource | LXC, 2 vCPU, 4GB RAM | +| **media** | Immich, Nextcloud, Paperless-ngx | Large 
storage, GPU passthrough useful for Immich ML | **VM** if GPU passthrough, else LXC. Lots of disk. | +| **business** | Cal.com ✅, Crater | Client-facing, financial — back up often | LXC, 2 vCPU, 2GB RAM | +| **monitoring** | Uptime Kuma ✅, Dockge ✅, Umami ✅, Beszel (later) | Ops stack on LXC **218** | LXC, 2 vCPU, 2GB RAM | +| **labs** | Anything experimental — Flowise, Trigger.dev | Things you're trying out, can be wiped | LXC, scratch space | + +### Why this grouping (cheat sheet) + +- One service goes bad → only its group restarts. +- Need a kernel upgrade for one stack → snapshot the LXC, upgrade, roll back if broken. +- Mailcow's huge surface area is isolated in its own VM. +- Edge LXC is tiny and stable → perfect for the layer everything depends on. +- Backup cadence per group (see Backups section). +- Resource limits per LXC mean a runaway container can't eat n8n's RAM. + +--- + +## Subdomains + +Only expose what actually needs to be public. Internal services use Tailscale/Wireguard for remote access. 
+ +### Expose publicly + +| Subdomain | Service | Group | Why public | Status | +|---|---|---|---|---| +| `levkin.ca` | Company site (spec + `/folders`) | edge | Main brand | ✅ LXC 220 — **DNS must point to home IP** (was parked elsewhere) | +| `caseware.levkin.ca` | Static site | edge | Marketing | ✅ live | +| `auto.levkin.ca` | Static site | edge | Marketing | ✅ live | +| `iliadobkin.com` | Portfolio (SDET) | edge | Personal site | ✅ live (pve10 LXC 219) | +| `cal.levkin.ca` | Cal.com | business | Clients book on it | ✅ live | +| `listmonk.levkin.ca` | Listmonk | productivity | Unsubscribe URLs must resolve | ✅ live | +| `mail.levkine.ca` | Mailcow | comms | Mail server | ✅ live | +| `auth.levkin.ca` | Authentik | identity | OIDC redirect URLs need external resolution | ✅ live | +| `bill.levkin.ca` | Crater | business | Clients view invoices | ⏳ Phase 6 | +| `cloud.levkin.ca` | Nextcloud | media | **Retiring** — decommission VM 201 after cutover | 🗑️ | +| `photos.levkin.ca` | Immich | media | Mobile apps need public hostname | ⏳ Phase 5 | +| `vault.levkin.ca` | Vaultwarden | identity | Mobile clients need public hostname | ⏳ | +| `notes.levkin.ca` | Outline | productivity | Sharing docs with clients | ⏳ | +| `chat.levkin.ca` | Mattermost | comms | Only if inviting outside users | ⏳ optional | + +### Keep internal only (no public DNS, no Caddy block) + +Reachable only via local network or Tailscale/Wireguard: + +| Service | Reason | +|---|---| +| Umami admin UI | Only you need the dashboard. Tracking endpoint can be public, dashboard isn't. | +| Uptime Kuma | Status dashboard is for you. Don't advertise infrastructure. | +| Beszel | Metrics are admin-only. | +| Dockge | Admin UI — local only. | +| n8n editor | UI shouldn't be exposed. Webhooks go on `hooks.levkin.ca` if needed. | +| Huginn / Windmill / Flowise | Admin tools. | +| Vikunja | Personal task manager. | +| Mealie | Family recipes. | +| Trigger.dev | Internal automation. 
| +| Paperless-ngx | Personal documents. Never expose. | +| SiYuan | Personal knowledge. | +| Linkwarden | Personal bookmarks. | + +### Borderline (decide per service) + +| Subdomain | Service | Notes | +|---|---|---| +| `stats.levkin.ca` | Umami collector | Only the tracking script endpoint needs to be public; admin UI stays internal | +| `status.levkin.ca` | Uptime Kuma | Kuma supports a separate public status page URL — that one can be public | + +--- + +## Phased rollout + +### Phase 0 — Foundation +1. ✅ Caddy running (on VM — migrate to LXC in Phase 1.5) +2. ✅ **Static IP audit (partial)** — all LXCs on pve10 pinned; Caddy VM static `.50`; remaining VMs on stable DHCP — see [host-list.md](host-list.md) +3. ✅ DNS for `auth.levkin.ca` → home IP (verified 2026-05-22) +4. ✅ `identity` LXC **217** @ `10.0.10.21` (2 vCPU, 2GB RAM, 20GB `local-lvm`, Debian 12 + Docker Compose) + +### Phase 1 — Identity ✅ +1. ✅ Deploy Authentik in `identity` LXC (Authentik + Postgres + Redis, official compose at `/opt/authentik`) +2. ✅ Caddy: `auth.levkin.ca` → `10.0.10.21:9000` (simple passthrough, no forward-auth) +3. ✅ Admin user (`admin`), TOTP enrolled +4. ✅ `authentik Admins` group (skip custom `users` group until more accounts) +5. ✅ Static backup codes; **don't OIDC other apps until Cal.com test** + +### Phase 1.5 — Caddy migration to LXC (~30 min) + +Why now (after Phase 1, before bulk SSO work in Phase 4): Authentik is stable enough to absorb a small change, but you haven't yet built the dependency web of OIDC integrations that would make a Caddy reload risky. + +Why Caddy belongs in an LXC, not a VM: +- ~50MB OS overhead vs ~512MB for a VM +- Boot/restart in 2-5s vs 20-40s (matters when reloading config) +- Snapshot/backup is faster +- Caddy is a Go binary doing reverse-proxy work — no need for kernel isolation +- Near-native network performance + +Steps: +1. Create `edge` LXC: Debian 12, 1 vCPU, 512MB RAM, 8GB disk, **static IP from host list** +2. 
Install Caddy via official Debian repo: + ```bash + apt install -y debian-keyring debian-archive-keyring apt-transport-https + curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg + curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | tee /etc/apt/sources.list.d/caddy-stable.list + apt update && apt install caddy + ``` +3. Copy `Caddyfile` + custom snippets (`(security-headers)` etc.) from the VM +4. Add a **test subdomain** (e.g. `test.levkin.ca`) pointing at the new LXC — verify TLS issues and routing works +5. Cut over: update router port-forward (80/443) to the new LXC IP. DNS A records don't need to change if they point to your home IP. +6. Watch Mailcow, Cal.com, Listmonk, the marketing sites for ~24h +7. Keep the old VM snapshot for a week, then delete + +### Phase 2 — Quick wins ✅ +1. ✅ **Umami** — tracking on levkin.ca, caseware, auto, and iliadobkin.com (portfolio) +2. ✅ **Uptime Kuma** — monitors in UI +3. ✅ **Dockge** — logged in; register `/opt/monitoring` stack (see [monitoring-stack.md](monitoring-stack.md)) +4. ✅ **Kuma email alerts** — SMTP via Mailcow (see [homelab-status-2026-05-22.md](homelab-status-2026-05-22.md)) + +### Phase 3 — Cal.com (mostly done) ✅ +1. ✅ Cal.com deployed in `business` LXC (id 210, Postgres included) +2. ✅ `cal.levkin.ca` proxied via Caddy +3. ✅ Booking link live at `cal.levkin.ca/ilia/consult` with Jitsi location +4. ✅ Email working via `cal@levkine.ca` SMTP through Mailcow +5. ⏳ **Wire Cal.com to Authentik via OIDC** (first real SSO connection — do this after Phase 1) +6. ⏳ Update `auto.levkin.ca` button → `cal.levkin.ca/ilia/consult` (currently points to placeholder) + +### Phase 4 — SSO migration (~half a day, staged) +Wire each to Authentik, least-risky first: +1. **Vikunja** (OIDC native) — easy, single-user impact +2. ~~**Nextcloud**~~ — **skipped** (VM 201 retiring) +3. 
**Listmonk** (OIDC native, admin only) — easy +4. **Mattermost** (SAML or OIDC native) — moderate +5. **Mailcow** (OIDC) — last, because mail-critical + +For each: keep a local admin password as a break-glass account. + +### Phase 5 — Family / personal wins (~1 evening) +1. **Immich** in `media` VM — install mobile apps for you and family, enable auto-upload. Face recognition runs in background; "my kids 2024" works within a couple days. +2. Skip PhotoPrism — Immich covers it. + +### Phase 6 — Business / consulting (~1–2 evenings) +1. **Crater** in `business` LXC — tax rates, company info, Stripe integration if you want online payment +2. **Beszel** hub in `monitoring` LXC + agents on each LXC — one dashboard for resource usage + +### Phase 7 — Automation depth (ongoing) +Only when you have a real use case: +1. **Huginn** in `automation` — first agent: competitor pages, kosher product availability, grant deadlines +2. **Windmill** in `automation` — first script: rewrite an n8n flow with too many code nodes +3. **Flowise** in `labs` — first flow: chat-with-docs against your consulting notes + +### Phase 8 — Knowledge / research +1. **Outline** in `productivity` LXC — client-facing wiki + your notes +2. **Linkwarden** in `productivity` LXC — bookmarks with full-page archive +3. **Paperless-ngx** in `media` — scan and OCR the paper that's accumulating +4. **SiYuan** — only if/when PhD or long-form research becomes relevant + +--- + +## Static IP audit + +**Maintain a `host-list.md` file** (in this Cursor project, alongside this plan) with every LXC/VM, its current IP, its target static IP, and DHCP/static status. Cursor will use this as the source of truth when scripting changes. + +Suggested format: + +| LXC/VM ID | Name | Role | Current IP | Target static IP | DHCP/Static | Notes | +|---|---|---|---|---|---|---| +| 210 | cal | Cal.com | 10.0.10.228/24 (DHCP) | 10.0.10.228/24 | ⏳ static | Convert ASAP | +| ... | ... | ... | ... | ... | ... | ... 
| + +### Recommended IP plan + +Use role-based ranges within the `10.0.10.0/24` subnet (or whatever your LAN is) so the layout is scannable: + +| Range | Reserved for | +|---|---| +| `.1 - .9` | Network gear (router, switches, APs) | +| `.10 - .19` | Proxmox host(s) + PBS | +| `.20 - .39` | Edge / identity / comms (critical infra) | +| `.40 - .79` | Application LXCs (productivity, automation, business, monitoring) | +| `.80 - .99` | Media VM(s) | +| `.100 - .199` | DHCP pool (clients, phones, laptops) | +| `.200 - .249` | Labs / experimental | +| `.250 - .254` | Reserved | + +### How to set static on a Proxmox LXC + +Two methods — pick one and stick with it: + +**Method A — Proxmox CLI (recommended, survives reboots cleanly):** +```bash +pct set <vmid> -net0 name=eth0,bridge=vmbr0,ip=10.0.10.X/24,gw=10.0.10.1 +pct reboot <vmid> +``` + +**Method B — Router DHCP reservation:** +- Reserve the IP in your router's DHCP table by MAC address. LXC stays "DHCP" technically, but always gets the same IP. +- Easier if you have many hosts and one router. +- Risk: if the LXC's MAC changes (rebuild from snapshot to new ID), reservation breaks. + +**Recommendation:** Method A (`pct set`) for everything critical (edge, identity, comms, business). Method B is fine for labs/experimental LXCs. + +### Audit checklist + +1. List every LXC: `pct list` +2. List every VM: `qm list` +3. For each, run `pct exec <vmid> -- ip a` (or `qm guest exec <vmid> -- ip a` for VMs) and check whether the IP came from DHCP +4. Fill in `host-list.md` +5. Pick target IPs from the range plan above +6. Convert one at a time, lowest-risk first (labs → productivity → business → comms → identity → edge) +7. **After each conversion**, verify the Caddy reverse-proxy entry still works (curl from outside) +8. Update `host-list.md` status column + +### Hosts known to need conversion right now + +- ~~**LXC 210 (cal)**~~ — static at `10.0.10.228` ✅ +- **Site LXCs 220, 215/216/219** — static; served via Caddy → nginx on each LXC (git deploy). 
Optional future: static files on Caddy VM only. + +--- + +## Backlog (priority order) + +### P0 — next (Phase 1–2 largely ✅) +1. ~~Umami~~ ✅ +2. ~~Uptime Kuma~~ ✅ +3. ~~Dockge~~ ✅ +4. **Cal.com → Authentik OIDC** — first SSO +5. **Retire Nextcloud VM 201** + **Portainer VM 109** — frees **~24 GiB** on pve10 +6. **Beszel** — fits on monitoring LXC 218 or small agent LXCs +7. **Mealie** — new small LXC on pve10 (~2 GB) + +### P1 — when ready +- **Outline** — wiki for client docs +- **Linkwarden** — bookmarks with full-page archive +- **Plane** — Jira-lite project management (pair with Mattermost) + +### P2 — when you have a real need +- **Crater** — invoicing (Phase 6) +- **Immich** — photos (Phase 5) +- **Paperless-ngx** — document scanning (Phase 8) +- **Huginn** — first when you have a monitoring use case +- **Windmill** — when n8n hits limits +- **Trigger.dev** — durable background jobs in code (better fit than Windmill for QA work) +- **PrivateBin** — encrypted paste for sharing secrets with contractors +- **Addy.io** — email aliases +- **SiYuan** — if PhD work picks up +- **Flowise** — labs only, when LLM workflow use case appears + +### Skip / declined +- ~~PhotoPrism~~ — Immich covers it +- ~~Activepieces~~ — you already have n8n +- ~~Affine / Trilium~~ — picked Outline + SiYuan instead +- ~~Matrix/Synapse + Element~~ — staying on Mattermost +- ~~Coolify / Dokploy / CapRover~~ — Dockge is enough; revisit only if writing many custom apps + +--- + +## Backup strategy + +- **Proxmox Backup Server (PBS)** or `vzdump` to a NAS — snapshot each LXC/VM nightly +- **Critical groups** (`identity`, `comms`, `business`): 7 daily + 4 weekly + 12 monthly +- **Productivity/automation**: 7 daily + 4 weekly +- **Labs**: 3 daily, no long retention +- **Off-site copy** of `identity` and `business` LXCs — these contain auth and billing data. Encrypted copy to Wasabi or Backblaze B2. + +The whole LXC gets snapshotted — much simpler than file-level container backup. 
+ +**Done on pve10 (2026-05-22):** `pct snapshot` **`backup-20260522`** on LXCs **217** (identity) and **218** (monitoring). + +--- + +## Next steps (priority order) + +See **[homelab-status-2026-05-22.md](homelab-status-2026-05-22.md)** for automation checklist. + +| # | Task | Status | Effort | Frees / unlocks | +|---|------|--------|--------|-----------------| +| 1 | **Kuma SMTP** | ✅ done | — | — | +| 2 | **Cal.com → Authentik OIDC** | ⏳ **next** | 1–2 h | First SSO; test before Vikunja/Listmonk | +| 3 | **auto.levkin.ca** → Cal booking link | ⏳ | 15 min | Phase 3 item 6 | +| 4 | **Stop Portainer VM 109** | ⏳ | 10 min | **~16 GiB RAM** on pve10 | +| 5 | **Retire Nextcloud VM 201** | ⏳ | 30 min | **~8 GiB RAM**; remove Caddy + Kuma monitor | +| 6 | **UniFi DHCP reservations** | ⏳ | 20 min | [unifi-static-dhcp.md](unifi-static-dhcp.md) | +| 7 | **Beszel** on 218 or agents | ⏳ | 1 h | Capacity visibility before Immich | +| 8 | **NAS.SP00** disk → Jellyfin | ⏳ hardware | — | VM 101 | +| 9 | **Caddy → edge LXC `.20`** | ⏳ defer | ~30 min | Phase 1.5 | +| 10 | **dev-apps LXC** (pote, mirrormatch, then punim) | ⏳ defer | half day | pve201 RAM; punim **last** | +| 11 | **Static sites → Caddy VM** (optional) | ⏳ defer | 1 h | ~1 GiB; breaks git-on-LXC workflow unless you move deploy to Caddy | + +**Defer:** Immich, Crater, Outline, Plane, SSO for Vikunja/Listmonk/Mailcow until rows 2–5 done. 
| + +### Adding a new service — quick rule + +| Want to add… | Node | RAM budget | Prerequisite | +|--------------|------|------------|--------------| +| Small app (Mealie, Linkwarden) | pve10 | 2 GB LXC | Stop 109 and/or 201 first if host feels tight | +| Medium (Outline, Crater) | pve10 | 4 GB LXC | Free **~24 GiB** via Portainer + Nextcloud retire | +| Heavy (Immich + ML) | pve10 or pve201 GPU | 4–8 GB+ | NAS healthy; pve201 only after GPU/punim sized down | +| Dev sandbox | pve10 `dev-apps` | 6–8 GB | punim 9101 migration only after testing | + +### Nextcloud decommission (VM 201) + +1. Confirm export in `exports/nextcloud-2026-05-21/` is complete +2. Delete **Nextcloud** monitor in Kuma +3. Remove `nextcloud.levkin.ca` from Caddy VM +4. Stop VM 201; update [host-list.md](host-list.md) +5. After NAS healthy: optional `vzdump` archive then delete disk + +--- + +## Important rules + +1. **Never put Authentik behind itself.** `auth.levkin.ca` is a simple Caddy passthrough — no forward-auth, no fancy dependencies. If Authentik sat behind its own forward-auth and went down, you'd be locked out of the very Authentik UI you need to fix it. +2. **Vaultwarden stays standalone.** It's your break-glass path if Authentik dies. Don't OIDC it. +3. **Keep a local admin password on every SSO-wired app.** OIDC integrations break during upgrades — you need to log in to fix them. +4. **Local admin to Proxmox host.** Independent of Authentik and Vaultwarden. Written down somewhere physical. +5. **Don't expose admin UIs publicly.** Dockge, Beszel, Uptime Kuma admin, n8n editor — use Tailscale or Wireguard for remote access. +6. **Static IPs for every LXC.** DHCP will eventually move them and Caddy will break. Set via `pct set <vmid> -net0 ...ip=10.0.10.X/24,gw=...` or a router reservation. +7. **Cal.com LXC (210)** — static at `.228` ✅. +8. **Maintain `host-list.md`** as the single source of truth for IPs. Update it whenever a new LXC/VM is created or migrated. 
diff --git a/docs/guides/monitoring-stack.md b/docs/guides/monitoring-stack.md new file mode 100644 index 0000000..f45152e --- /dev/null +++ b/docs/guides/monitoring-stack.md @@ -0,0 +1,232 @@ +# Monitoring stack (LXC 218) + +**Host:** `monitoring` @ `10.0.10.22` (PVENAS pve10, VMID **218**) +**Compose:** `/opt/monitoring/compose.yml` +**Stacks dir (Dockge):** `/opt/stacks` + +All admin UIs are **LAN-only** (no public Caddy blocks). Use Tailscale or local network. + +| Service | URL | Port | Notes | +|---------|-----|------|-------| +| **Uptime Kuma** | http://10.0.10.22:3001 | 3001 | Admin + monitors configured ✅ (replaces pve201 LXC **305** @ `.197`, stopped) | +| **Dockge** | http://10.0.10.22:5001 | 5001 | Manage compose on **this LXC only** | +| **Umami** | http://10.0.10.22:3000 | 3000 | Password changed ✅; levkin.ca + caseware + auto + portfolio tracked | + +Secrets: `/opt/monitoring/.env` on the LXC (mode 600). Not in git. + +--- + +## Backups (pve10) + +| Guest | VMID | Snapshot | Date | +|-------|------|----------|------| +| identity | 217 | `backup-20260522` | 2026-05-22 | +| monitoring | 218 | `backup-20260522` | 2026-05-22 | + +On pve10: + +```bash +pct listsnapshot 217 +pct listsnapshot 218 +# Rollback if needed: +# pct rollback 217 backup-20260522 +``` + +Optional off-node copy (when NAS healthy): `vzdump 217 218 --storage local --mode snapshot --compress zstd` + +--- + +## Uptime Kuma — monitors + +Configured in UI (all green). **Remove** the Nextcloud monitor when VM 201 is retired. + +| Name | URL | +|------|-----| +| Authentik | https://auth.levkin.ca | +| Cal.com | https://cal.levkin.ca | +| Caseware / Auto | marketing sites | +| Mailcow | https://mail.levkine.ca | +| Listmonk, Gitea, Vault, Todo, PVE nodes | per your dashboard | + +--- + +## Uptime Kuma — email alerts (Mailcow) + +Mail domain is **`levkine.ca`** (with **e**). Cal.com already sends via Mailcow as `cal@levkine.ca`. 
+ +### Which email to use + +| Role | Address | Notes | +|------|---------|-------| +| **SMTP server** | `mail.levkine.ca` | Mailcow host | +| **SMTP port** | `587` | STARTTLS (not 465 unless you prefer SMTPS) | +| **From (sender)** | `alerts@levkine.ca` | Create mailbox in Mailcow if it does not exist | +| **To (you)** | `idobkin@gmail.com` or `ilia@levkine.ca` | Use whichever you read; Gmail is fine for alerts | + +### 1. Create mailbox in Mailcow (if needed) + +**Automated (needs Mailcow API key):** + +```bash +# Define mailbox in group_vars/all/mailcow.yml, password in vault: +make mailcow-mailbox MAILBOX=alerts +# (alias: make mailcow-create-alerts) + +# Import from .env into vault once, then delete .env: +cp .env.example .env # MAILCOW_API_KEY=... ALERTS_PASSWORD=... +make vault-import-env +make mailcow-mailbox MAILBOX=alerts +``` + +To add another mailbox later: define it in `mailcow.yml`, store its password in the vault under `vault_mailcow_mailbox_passwords.<name>`, then run `make mailcow-mailbox MAILBOX=<name>`. + +**Manual UI:** + +1. https://mail.levkine.ca → admin login +2. **Email → Mailboxes → Add** → `alerts@levkine.ca` (strong password → store in Vaultwarden) +3. Optional: alias `monitoring@levkine.ca` → same inbox + +### 2. Add notification in Kuma + +**Automated (from your Mac, after mailbox exists):** + +```bash +cd /path/to/ansible +pip install uptime-kuma-api # or: .venv/bin/pip install uptime-kuma-api +export KUMA_URL=http://10.0.10.22:3001 KUMA_USER=admin KUMA_PASSWORD='...' +export SMTP_USER=alerts@levkine.ca SMTP_PASS='...' SMTP_TO=idobkin@gmail.com +./scripts/kuma-setup-smtp.sh +``` + +**Manual UI:** + +1. http://10.0.10.22:3001 → **Settings** → **Notifications** → **Setup Notification** +2. Type: **Email (SMTP)** +3. 
Fill in: + + | Field | Value | + |-------|--------| + | SMTP Host | `mail.levkine.ca` | + | SMTP Port | `587` | + | Security | TLS / STARTTLS | + | Username | `alerts@levkine.ca` | + | Password | mailbox password | + | From Email | `alerts@levkine.ca` | + | To Email | `idobkin@gmail.com` (or your `@levkine.ca`) | + +4. **Test** → save +5. Edit each monitor (or default) → **Notifications** → enable this channel + +**Alternative:** Mattermost webhook (`slack.levkin.ca`) if you prefer chat over email. + +--- + +## Dockge — what to do after login + +**On server today:** + +| Path | Contents | +|------|----------| +| `/opt/monitoring/compose.yml` | **Live** stack (Docker project `monitoring`, 4 containers running) | +| `/opt/stacks/monitoring/compose.yaml` | Copy for Dockge (same services) | +| `/opt/stacks/authentik-ref/`, `cal-ref/` | README only — **no** compose file (ignore) | + +**Why “Scan Stacks Folder” looks empty** + +- Scan only picks up folders under **`/opt/stacks`** that contain `compose.yaml` / `compose.yml`. +- Your containers were started from **`/opt/monitoring`**, so Docker does not automatically link them to `/opt/stacks/monitoring` until you register that folder in Dockge. + +**Fix (pick one):** + +### Dockge UI note (your version) + +**Settings → General** only has hostname — there is **no “Stacks directory” field**. That path is fixed at deploy time: + +`DOCKGE_STACKS_DIR=/opt/stacks` (already set in `/opt/monitoring/compose.yml`). + +Stacks are managed from the **home / dashboard** page, not Settings. + +### Option 1 — Add stack manually (recommended) + +1. http://10.0.10.22:5001 → **home** (logo / dashboard, not Settings) +2. **+ Create Stack** (or **Compose** → new stack) +3. Name: `monitoring` +4. Path: `/opt/stacks/monitoring` (must contain `compose.yaml`) +5. Open stack → review compose → **do not Start** until old project is stopped (below) + +### Option 2 — Scan from dashboard menu + +1. Stay on **dashboard** (not Settings) +2. 
Top-right **⋮** → **Scan Stacks Folder** +3. Pick **`monitoring`** if it appears (`authentik-ref` / `cal-ref` have no compose — ignore) + +**Avoid duplicate containers** + +Before starting from Dockge: + +```bash +ssh root@10.0.10.22 +cd /opt/monitoring && docker compose down +# Then start from Dockge UI on stack monitoring, OR: +cd /opt/stacks/monitoring && docker compose --env-file .env up -d +``` + +Until you do that, Kuma/Dockge/Umami keep running from `/opt/monitoring`; Dockge is optional for edits until cutover. + +### Optional reference stacks (read-only) + +Create empty stacks under `/opt/stacks/` only if you want a UI placeholder: + +```bash +ssh root@10.0.10.22 +mkdir -p /opt/stacks/authentik /opt/stacks/cal +# Copy compose for reference (does NOT control remote host): +scp root@10.0.10.21:/opt/authentik/compose.yml /opt/stacks/authentik/ +``` + +To **manage** Authentik or Cal from Dockge long term, either move compose to 218 (not recommended) or install Dockge on each LXC later. + +### Step 3 — Retire Portainer + +When comfortable: stop VM **109** (portainer) on pve10; use Dockge on 218 instead. + +--- + +## Umami + +- ✅ Running at http://10.0.10.22:3000 (LAN / Tailscale only) +- ✅ **Public tracking** via `https://stats.levkin.ca/script.js` on **levkin.ca** (LXC 220), caseware, auto, and **iliadobkin.com** (portfolio LXC 219) + +**Three choices (pick one later; none block the sites):** + +| Option | Effort | Notes | +|--------|--------|--------| +| **A — Skip public analytics** | 0 | Use Umami dashboard on `:3000` when you care; no DNS/Caddy | +| **B — One DNS + Caddy block** | ~10 min | A record → home IP + Caddy `reverse_proxy 10.0.10.22:3000` on caddy VM | +| **C — Re-add script tags** | 2 min | After B works, insert script before `` on 215/216 | + +**Suggested public hostname (instead of `analytics`):** `stats.levkin.ca` (short, clear). Alternatives: `umami.levkin.ca`, `metrics.levkin.ca`. 
+ +```caddy +stats.levkin.ca { + import security-headers + encode gzip + reverse_proxy 10.0.10.22:3000 +} +``` + +Script tag then: `https://stats.levkin.ca/script.js` + +We are **not stuck** — marketing sites do not need Umami to render. Option A is fine for now. + +--- + +## Maintenance + +```bash +ssh root@10.0.10.22 +cd /opt/monitoring +docker compose --env-file .env pull +docker compose --env-file .env up -d +docker compose ps +``` diff --git a/docs/guides/nas-sp00-drive-failure-report.md b/docs/guides/nas-sp00-drive-failure-report.md new file mode 100644 index 0000000..1bbbb4e --- /dev/null +++ b/docs/guides/nas-sp00-drive-failure-report.md @@ -0,0 +1,203 @@ +# NAS.SP00 drive failure — IT report + +**Date:** 2026-05-21 +**Host:** PVENAS (Proxmox VE) — `10.0.10.10` +**Pool:** ZFS `NAS.SP00` (~9 TB, ~862 GB used) +**Prepared for:** IT / hardware replacement +**SMART audit:** [nas-sp00-smart-audit-2026-05-21.md](nas-sp00-smart-audit-2026-05-21.md) + +--- + +## Executive summary + +One disk in the four-drive ZFS pool (two mirrored pairs) has **failed at the hardware level**. The pool is **DEGRADED** but **online** with **no known data errors** at this time. The failed drive must be **physically replaced** and the pool **resilvered**. Until then, **mirror-0 has no redundancy** — a second failure on the remaining disk in that mirror (`W4J0L0BA`) could cause data loss. + +This issue also caused a **host-wide I/O wedge** (pool SUSPENDED → stuck `sync()`), which blocked LXC/VM operations unrelated to the pool (e.g. Cal.com on `local-lvm`). That was cleared by a forced node reboot; **replacing the drive remains required**. 
+ +--- + +## Pool layout + +| Vdev | Role | Disk A | Disk B | Status | +|------|------|--------|--------|--------| +| mirror-0 | RAID1 pair | `W4J0L0BA` (sda, 5 TB) | `W4J0L3PY` (sdb) | **DEGRADED** — sdb UNAVAIL | +| mirror-1 | RAID1 pair | `W4J0LKCD` (sdd, 5 TB) | `W4J0K9V7` (sdc, 5 TB) | **ONLINE** | + +Model family (healthy drives): Seagate **ST5000DM000-1FK178** (5 TB; SMART reports 5980 rpm on the healthy drives — see the SMART audit). + +--- + +## Failed drive identification + +| Field | Expected | Observed | +|-------|----------|----------| +| **Serial** | W4J0L3PY | W4J0L3PY | +| **Model** | ST5000DM000-1FK178 | ST5000DM000 (truncated reporting) | +| **WWN** | — | `5000c50082cc8bbb` | +| **Firmware** | — | CC48 | +| **Capacity** | ~5,000,981,078,016 bytes (**5.00 TB**) | **137,438,952,960 bytes (~137 GB)** | +| **Linux device** | `/dev/sdb` | `/dev/sdb` | +| **ZFS state** | ONLINE | **UNAVAIL** — label missing/invalid | + +ZFS last known path: +`/dev/disk/by-id/ata-ST5000DM000-1FK178_W4J0L3PY-part1` + +--- + +## Symptoms and evidence + +### 1. Capacity collapse (primary indicator) + +The drive is detected as **~137 GB** instead of **5 TB**. ZFS cannot use a partition label created for a 5 TB disk on a device that exposes only a tiny fraction of capacity. This pattern is typical of: + +- **Failed HDD** (media/controller failure) +- **Bad SATA cable, backplane port, or HBA port** +- **USB/SATA bridge failure** (if applicable) +- **Severe firmware/HPA corruption** (less common) + +### 2. SMART / SCSI errors + +`smartctl` against `/dev/sdb`: + +- **Read SMART Data failed:** scsi error aborted command +- **Overall health:** UNKNOWN (attributes unreadable) +- Multiple log read commands fail (Error Log, Self-test Log, GP Log, etc.) + +Healthy sibling in same mirror (`/dev/sda`, W4J0L0BA): **SMART PASSED**, full 5 TB capacity. + +### 3. Kernel log (`dmesg` at boot, 2026-05-21 ~21:27) + +Repeated on **`sdb`**: + +``` +Buffer I/O error on dev sdb +Sense Key: Medium Error +Add. 
Sense: Unrecovered read error +critical medium error, dev sdb, sector N op 0x0:(READ) +``` + +Indicates the block device cannot reliably read media — **hardware or link layer**, not a ZFS configuration issue. + +### 4. ZFS pool history + +- Pool previously entered **SUSPENDED** state (I/O failures on faulted devices). +- After node reboot: pool **DEGRADED**, short **resilver** completed with **0 errors** (healing scan on remaining devices). +- Current: **No known data errors** in `zpool status`. + +--- + +## Impact + +### Storage / services on `NAS.SP00` + +Proxmox guests with disks on this pool (non-exhaustive): + +| VMID | Name | NAS-backed storage | +|------|------|-------------------| +| 101 | Jellyfin | 1 TB zvol | +| 105 | TrueNAS | 1 TB zvol | +| 108 | actual-debian | 10 GB | +| 200 | PVE.BU.SVR | 1 TB | +| 201 | NextcloudAIO-debian | 8 TB | + +**Risk:** With mirror-0 degraded, blocks stored only on the surviving mirror-0 disk have **no redundancy** until the failed drive is replaced and resilver completes. + +### Unrelated workloads + +Guests on **`local-lvm`** (NVMe, e.g. Cal.com LXC 210, Caddy VM 106) are **not stored on NAS.SP00** but were affected when the pool suspended and blocked system-wide `sync()`. + +### Backup target + +Proxmox datastore **PVEBUVD00** (PBS @ `10.0.10.200:8007`) reports **unreachable** from this node — separate issue; verify PBS host/network. + +--- + +## Diagnosis + +| Question | Answer | +|----------|--------| +| Is this a ZFS misconfiguration? | **No** — config is consistent; three drives show correct 5 TB labels. | +| Is the pool lost? | **No** — degraded but importable; no known data errors currently. | +| Which disk to replace? | **Seagate W4J0L3PY** (`/dev/sdb`, mirror-0 failed leg). | +| Can we fix it in software? | **Unlikely** — capacity and SMART failures point to hardware. | +| Safe to reseat first? | **Optional trial** — power down or hot-swap per chassis policy; if capacity still reads ~137 GB, **replace disk**. 
| + +--- + +## Recommended actions + +### Immediate (IT / on-site) + +1. **Identify physical slot** for serial **W4J0L3PY** (compare to inventory/asset tags). +2. **Reseat** SATA/SAS cable and backplane connection once (if hot-swap policy allows). Reboot or rescan SCSI bus. +3. If capacity is still wrong or SMART still fails → **replace with new 5 TB+ enterprise/NAS-class HDD** (match class of ST5000DM000 or better). +4. Do **not** remove the UNAVAIL device from the pool until replacement is in place. + +### After new disk is installed + +On **PVENAS** as root (adjust `/dev/disk/by-id/...` to the **new** drive; pass the whole disk — a blank disk has no `-part1` yet and ZFS partitions it itself): + +```bash +# Verify new disk shows ~5 TB +lsblk /dev/sdX +smartctl -H /dev/sdX + +# Replace failed vdev (old ID exactly as shown by: zpool status NAS.SP00 — an UNAVAIL leg is listed by numeric GUID) +zpool replace NAS.SP00 <old-vdev-id-or-guid> /dev/disk/by-id/ata-NEW_MODEL_NEW_SERIAL + +# Monitor until resilver completes +zpool status -v NAS.SP00 +``` + +### Post-resilver + +- Run **`zpool scrub NAS.SP00`** during a maintenance window. +- Confirm **PVEBUVD00** / PBS connectivity if backups depend on it. +- Review whether **Nextcloud VM 201** (8 TB on degraded pool) should remain running until healthy. + +### Not recommended + +- Ignoring degraded state for extended periods. +- Running heavy I/O on large VMs (e.g. 8 TB Nextcloud) during extended degraded operation. +- `zpool clear` without addressing hardware — does not fix a dead disk. 
+ +--- + +## Reference — healthy disks (for spare matching) + +| Serial | Device | Capacity | SMART | +|--------|--------|----------|-------| +| W4J0L0BA | sda | 5.00 TB | PASSED | +| W4J0K9V7 | sdc | 5.00 TB | PASSED | +| W4J0LKCD | sdd | 5.00 TB | PASSED | + +--- + +## Timeline (brief) + +| When | Event | +|------|--------| +| Prior to 2026-05-21 | `W4J0L3PY` accumulated read/write errors; pool faulted | +| 2026-05-21 | Pool **SUSPENDED**; host `sync()` wedged; Cal LXC start failed | +| 2026-05-21 ~21:28 | Forced node reboot; pool **DEGRADED**, resilver finished, 0 errors | +| 2026-05-21 | `sdb` still reports **~137 GB**, UNAVAIL — **replacement still required** | + +--- + +## Contact / handoff notes + +- **Node:** Proxmox VE 8.x on **PVENAS** (`10.0.10.10`) +- **Pool name in Proxmox:** `NAS.SP00` (zfspool, active, degraded) +- **Failed serial:** **W4J0L3PY** +- **Replacement type:** 5 TB+ HDD, same or better class as Seagate ST5000DM000-1FK178 + +For questions about homelab service impact (Cal, Caddy, Phase 0 rollout), see [`levkin-selfhost-plan-2.md`](levkin-selfhost-plan-2.md). +## TL;DR + +- Pool `NAS.SP00` on `PVENAS` (10.0.10.10) had a disk failure (`W4J0L3PY`) +- Pool went **SUSPENDED**; required forced reboot and is now **DEGRADED** +- **Immediate action:** Replace the failed drive with a spare (same or larger size; see healthy serials in the reference table above) +- Use `zpool replace` with the correct device paths (see "After new disk is installed" above) +- Monitor resilver to completion; run `zpool scrub` after +- Backup services and large VMs (e.g. 
Nextcloud 8TB) depend on pool health—keep degraded time short +- Reach out if unsure about pool status or downstream service risk \ No newline at end of file diff --git a/docs/guides/nas-sp00-smart-audit-2026-05-21.md b/docs/guides/nas-sp00-smart-audit-2026-05-21.md new file mode 100644 index 0000000..0d52576 --- /dev/null +++ b/docs/guides/nas-sp00-smart-audit-2026-05-21.md @@ -0,0 +1,232 @@ +# NAS.SP00 SMART audit + +**Date:** 2026-05-21 +**Host:** PVENAS (Proxmox VE) — `10.0.10.10` +**Pool:** ZFS `NAS.SP00` +**Related:** [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md) + +--- + +## Executive summary + +| Serial | Device | Capacity | ZFS (mirror) | SMART health | +|--------|--------|----------|--------------|--------------| +| W4J0L0BA | sda | 5.00 TB | mirror-0 ONLINE | **PASSED** | +| W4J0L3PY | sdb | **137 GB** | mirror-0 UNAVAIL | **UNKNOWN** (read fails) | +| W4J0K9V7 | sdc | 5.00 TB | mirror-1 ONLINE | **PASSED** | +| W4J0LKCD | sdd | 5.00 TB | mirror-1 ONLINE | **PASSED** | + +Pool state at audit time: **DEGRADED** — failed leg `W4J0L3PY` (`/dev/sdb`). No known data errors. Three healthy drives show no reallocated, pending, or uncorrectable sectors. + +--- + +## ZFS pool status + +``` + pool: NAS.SP00 + state: DEGRADED +status: One or more devices could not be used because the label is missing or + invalid. Sufficient replicas exist for the pool to continue + functioning in a degraded state. +action: Replace the device using 'zpool replace'. 
+ scan: resilvered 0B in 00:00:01 with 0 errors on Thu May 21 21:27:54 2026 + + NAME STATE READ WRITE CKSUM + NAS.SP00 DEGRADED 0 0 0 + mirror-0 DEGRADED 0 0 0 + ata-ST5000DM000-1FK178_W4J0L0BA ONLINE 0 0 0 + 11449632222283419591 UNAVAIL 0 0 0 was /dev/disk/by-id/ata-ST5000DM000-1FK178_W4J0L3PY-part1 + mirror-1 ONLINE 0 0 0 + ata-ST5000DM000-1FK178_W4J0LKCD ONLINE 0 0 0 + ata-ST5000DM000-1FK178_W4J0K9V7 ONLINE 0 0 0 + +errors: No known data errors +``` + +--- + +## Block devices (`lsblk`) + +| NAME | SIZE | MODEL | SERIAL | ROTA | +|------|------|-------|--------|------| +| sda | 4.5T | ST5000DM000-1FK178 | W4J0L0BA | 1 | +| sdb | 3.9G | ST5000DM000 | W4J0L3PY | 1 | +| sdc | 4.5T | ST5000DM000-1FK178 | W4J0K9V7 | 1 | +| sdd | 4.5T | ST5000DM000-1FK178 | W4J0LKCD | 1 | + +--- + +## Healthy drives — key metrics + +| Metric | sda (W4J0L0BA) | sdc (W4J0K9V7) | sdd (W4J0LKCD) | +|--------|----------------|----------------|----------------| +| Model | ST5000DM000-1FK178 | ST5000DM000-1FK178 | ST5000DM000-1FK178 | +| Firmware | CC48 | CC48 | CC48 | +| WWN | 5000c500082c02f61 | 5000c500082c7e2ce | 5000c500082d84c45 | +| Rotation | 5980 rpm | 5980 rpm | 5980 rpm | +| SATA | 3.1 @ 6.0 Gb/s | 3.1 @ 6.0 Gb/s | 3.1 @ 6.0 Gb/s | +| Power-on hours | 52,481 (~6.0 y) | 53,087 (~6.1 y) | 45,580 (~5.2 y) | +| Temperature | 27 °C | 30 °C | 30 °C | +| Reallocated sectors | 0 | 0 | 0 | +| Current pending sectors | 0 | 0 | 0 | +| Offline uncorrectable | 0 | 0 | 0 | +| UDMA CRC errors | 0 | 0 | 0 | +| Start/stop count | 350 | 367 | 310 | +| Load cycle count | 348,974 | 340,961 | 184,891 | +| Power cycle count | 345 | 363 | 309 | + +High **Load_Cycle_Count** on Seagate Desktop HDD.15 is common (head parking); not alarming when reallocated/pending counts remain zero. 
+ +--- + +## Failed drive — `/dev/sdb` (W4J0L3PY) + +### Identity + +| Field | Value | +|-------|-------| +| Device Model | ST5000DM000 (truncated; not full -1FK178 suffix) | +| Serial | W4J0L3PY | +| WWN | 5000c500082cc8bbb | +| Firmware | CC48 | +| User capacity | 137,438,952,960 bytes [**137 GB**] | +| Expected capacity | 5,000,981,078,016 bytes [5.00 TB] | +| Rotation | 7200 rpm (reported) | +| SATA | 3.0, 6.0 Gb/s | + +### SMART + +``` +Read SMART Data failed: scsi error aborted command +SMART Status command failed: scsi error aborted command +SMART overall-health self-assessment test result: UNKNOWN! +SMART Status, Attributes and Thresholds cannot be read. +``` + +**Action:** Replace drive; see [nas-sp00-drive-failure-report.md](nas-sp00-drive-failure-report.md). + +--- + +## Full SMART attributes (healthy drives) + +### `/dev/sda` — W4J0L0BA (mirror-0, ONLINE) + +``` +SMART overall-health self-assessment test result: PASSED + +ID# ATTRIBUTE_NAME VALUE WORST THRESH TYPE RAW_VALUE + 1 Raw_Read_Error_Rate 119 100 006 Pre-fail 211189952 + 3 Spin_Up_Time 092 091 000 Pre-fail 0 + 4 Start_Stop_Count 100 100 020 Old_age 350 + 5 Reallocated_Sector_Ct 100 100 010 Pre-fail 0 + 7 Seek_Error_Rate 080 060 030 Pre-fail 43979429424 + 9 Power_On_Hours 041 041 000 Old_age 52481 + 10 Spin_Retry_Count 100 100 097 Pre-fail 0 + 12 Power_Cycle_Count 100 100 020 Old_age 345 +183 Runtime_Bad_Block 100 100 000 Old_age 0 +184 End-to-End_Error 100 100 099 Old_age 0 +187 Reported_Uncorrect 100 100 000 Old_age 0 +188 Command_Timeout 100 099 000 Old_age 3 3 3 +189 High_Fly_Writes 100 100 000 Old_age 0 +190 Airflow_Temperature_Cel 073 058 045 Old_age 27 (Min/Max 27/28) +191 G-Sense_Error_Rate 100 100 000 Old_age 0 +192 Power-Off_Retract_Count 100 100 000 Old_age 0 +193 Load_Cycle_Count 001 001 000 Old_age 348974 +194 Temperature_Celsius 027 042 000 Old_age 27 +195 Hardware_ECC_Recovered 119 100 000 Old_age 211189952 +197 Current_Pending_Sector 100 100 000 Old_age 0 +198 
Offline_Uncorrectable 100 100 000 Old_age 0 +199 UDMA_CRC_Error_Count 200 200 000 Old_age 0 +240 Head_Flying_Hours 100 253 000 Old_age 15140h+51m+12.276s +241 Total_LBAs_Written 100 253 000 Old_age 57665101118 +242 Total_LBAs_Read 100 253 000 Old_age 160962549062 +``` + +### `/dev/sdc` — W4J0K9V7 (mirror-1, ONLINE) + +``` +SMART overall-health self-assessment test result: PASSED + +ID# ATTRIBUTE_NAME VALUE WORST THRESH TYPE RAW_VALUE + 1 Raw_Read_Error_Rate 117 100 006 Pre-fail 136042192 + 3 Spin_Up_Time 092 091 000 Pre-fail 0 + 4 Start_Stop_Count 100 100 020 Old_age 367 + 5 Reallocated_Sector_Ct 100 100 010 Pre-fail 0 + 7 Seek_Error_Rate 083 060 030 Pre-fail 22512744055 + 9 Power_On_Hours 040 040 000 Old_age 53087 + 10 Spin_Retry_Count 100 100 097 Pre-fail 0 + 12 Power_Cycle_Count 100 100 020 Old_age 363 +183 Runtime_Bad_Block 100 100 000 Old_age 0 +184 End-to-End_Error 100 100 099 Old_age 0 +187 Reported_Uncorrect 100 100 000 Old_age 0 +188 Command_Timeout 100 099 000 Old_age 6 6 12 +189 High_Fly_Writes 096 096 000 Old_age 4 +190 Airflow_Temperature_Cel 070 060 045 Old_age 30 (Min/Max 28/30) +191 G-Sense_Error_Rate 100 100 000 Old_age 0 +192 Power-Off_Retract_Count 100 100 000 Old_age 0 +193 Load_Cycle_Count 001 001 000 Old_age 340961 +194 Temperature_Celsius 030 040 000 Old_age 30 +195 Hardware_ECC_Recovered 117 100 000 Old_age 136042192 +197 Current_Pending_Sector 100 100 000 Old_age 0 +198 Offline_Uncorrectable 100 100 000 Old_age 0 +199 UDMA_CRC_Error_Count 200 200 000 Old_age 0 +240 Head_Flying_Hours 100 253 000 Old_age 15859h+53m+20.869s +241 Total_LBAs_Written 100 253 000 Old_age 57609506493 +242 Total_LBAs_Read 100 253 000 Old_age 152392393081 +``` + +### `/dev/sdd` — W4J0LKCD (mirror-1, ONLINE) + +``` +SMART overall-health self-assessment test result: PASSED + +ID# ATTRIBUTE_NAME VALUE WORST THRESH TYPE RAW_VALUE + 1 Raw_Read_Error_Rate 116 090 006 Pre-fail 108217848 + 3 Spin_Up_Time 092 091 000 Pre-fail 0 + 4 Start_Stop_Count 100 100 020 Old_age 310 + 5 
Reallocated_Sector_Ct 100 100 010 Pre-fail 0 + 7 Seek_Error_Rate 073 051 030 Pre-fail 185584998742 + 9 Power_On_Hours 048 048 000 Old_age 45580 + 10 Spin_Retry_Count 100 100 097 Pre-fail 0 + 12 Power_Cycle_Count 100 100 020 Old_age 309 +183 Runtime_Bad_Block 100 100 000 Old_age 0 +184 End-to-End_Error 100 100 099 Old_age 0 +187 Reported_Uncorrect 100 100 000 Old_age 0 +188 Command_Timeout 100 099 000 Old_age 8 8 14 +189 High_Fly_Writes 098 098 000 Old_age 2 +190 Airflow_Temperature_Cel 070 050 045 Old_age 30 (Min/Max 29/30) +191 G-Sense_Error_Rate 100 100 000 Old_age 0 +192 Power-Off_Retract_Count 100 100 000 Old_age 0 +193 Load_Cycle_Count 008 008 000 Old_age 184891 +194 Temperature_Celsius 030 050 000 Old_age 30 +195 Hardware_ECC_Recovered 116 100 000 Old_age 108217848 +197 Current_Pending_Sector 100 091 000 Old_age 0 +198 Offline_Uncorrectable 100 091 000 Old_age 0 +199 UDMA_CRC_Error_Count 200 200 000 Old_age 0 +240 Head_Flying_Hours 100 253 000 Old_age 11604h+15m+50.842s +241 Total_LBAs_Written 100 253 000 Old_age 72962800596 +242 Total_LBAs_Read 100 253 000 Old_age 167268621195 +``` + +--- + +## How this audit was collected + +On PVENAS as root: + +```bash +zpool status NAS.SP00 +lsblk -d -o NAME,SIZE,MODEL,SERIAL,ROTA,STATE /dev/sd{a,b,c,d} +for d in sda sdb sdc sdd; do smartctl -i -H -A /dev/$d; done +``` + +Audit timestamp (host local): Thu May 21 22:13:58 2026 EDT. + +--- + +## Next steps + +1. Replace **W4J0L3PY** with a 5 TB+ NAS-class HDD (match ST5000DM000-1FK178 or better). +2. `zpool replace NAS.SP00` with the new disk by-id. +3. Monitor resilver; run `zpool scrub NAS.SP00` after pool is **ONLINE**. +4. Re-run SMART audit after replacement for a clean baseline. 
diff --git a/docs/guides/security-audit-report.md b/docs/guides/security-audit-report.md new file mode 100644 index 0000000..64ca3e4 --- /dev/null +++ b/docs/guides/security-audit-report.md @@ -0,0 +1,264 @@ +# Security Audit Report + +**Date:** 2026-05-20 +**Auditor:** Automated read-only scan (`scripts/security-audit-*.sh`) +**Scope:** Proxmox nodes `pve201` (10.0.10.201) and `pve10` (10.0.10.10), all LXCs via `pct exec`, SSH deep-dive on hypervisors. + +**Repo baseline** (`roles/ssh/defaults/main.yml`): `PermitRootLogin prohibit-password`, `PasswordAuthentication no`, UFW enabled. + +--- + +## Executive summary + +| Area | Critical | High | Medium | +|------|----------|------|--------| +| Hypervisors (201, 10) | 2 | 4 | 2 | +| LXCs on 201 (10 running) | 0 | 10 | 8 | +| LXCs on 10 (3 running) | 0 | 3 | 3 | + +**Top priorities** + +1. Harden **SSH on both Proxmox hosts** (root + passwords currently allowed). +2. Restrict **Proxmox API/UI port 8006** to admin IPs. +3. Disable **password SSH on all LXCs**; deploy keys + `make copy-ssh-keys` for inventory IPs. +4. Patch hosts with **40–105** pending apt upgrades (hypervisors worst). +5. Put **HTTP services** (8080, 8000, qBit, etc.) behind reverse proxy + TLS or bind to internal IPs. 
+ +--- + +## Proxmox hypervisors + +### pve201 — 10.0.10.201 (`pve`) + +| Resource | Status | +|----------|--------| +| OS | Debian 12, PVE 8.4.16, kernel 6.8.12-18-pve | +| RAM free | ~2.5 GB / 126 GB (**critical**) | +| Pending apt | **105** | +| UFW / fail2ban / unattended-upgrades | **None** | + +#### SSH audit (dedicated) + +| Setting | Current | Target | +|---------|---------|--------| +| `permitrootlogin` | **yes** | `prohibit-password` | +| `passwordauthentication` | **yes** | `no` | +| `pubkeyauthentication` | yes | yes | +| `maxauthtries` | 6 | 3–4 | +| `x11forwarding` | yes | no (on servers) | +| Root keys | 3 keys in `authorized_keys` | audit/remove unused | + +#### Exposed services + +| Port | Service | Risk | +|------|---------|------| +| 22 | SSH | Brute-force (no fail2ban) | +| 8006 | Proxmox API/UI | **Critical** — full cluster control | +| 3128 | spiceproxy | Medium | +| 111 | rpcbind | Low — reduce exposure | + +#### Fixes (pve201) + +```bash +# 1) SSH — prefer Ansible after limiting to your IP +make copy-ssh-key HOST=pve201 # if needed +# Manual quick fix on host: +sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config +sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +sshd -t && systemctl reload sshd + +# 2) Proxmox firewall — Datacenter → Firewall → restrict 8006 to 10.0.10.0/24 or admin IP +# Or iptables on host for port 8006 + +# 3) fail2ban +apt install fail2ban -y +systemctl enable --now fail2ban + +# 4) Auto security updates +apt install unattended-upgrades apt-listchanges -y +dpkg-reconfigure -plow unattended-upgrades + +# 5) Patch +apt update && apt upgrade -y +``` + +**Ansible (when ready):** add `pve201` / `pve10` to a `proxmox` group play with `roles/ssh` + `roles/monitoring_server` (fail2ban). +Do **not** lock yourself out — test with second session first. 
+ +--- + +### pve10 — 10.0.10.10 (`PVENAS`) + +| Resource | Status | +|----------|--------| +| OS | Debian 13 (trixie), PVE, kernel 6.17.13-3-pve | +| Load | **~30** on 24 CPUs (overloaded) | +| Pending apt | **92** | +| UFW / fail2ban / unattended-upgrades | **None** | +| ZFS `NAS.SP00` | **inactive** (I/O suspended) | +| PBS `PVEBUVD00` → 10.0.10.200:8007 | **unreachable** | + +#### SSH audit (dedicated) + +Same as pve201: `permitrootlogin yes`, `passwordauthentication yes`, 3 root authorized_keys. + +#### Exposed services + +| Port | Service | Risk | +|------|---------|------| +| 22 | SSH | High | +| 8006 | Proxmox API/UI | **Critical** | +| 2049, mountd, statd | NFS/RPC | High on LAN | +| 3128 | spiceproxy | Medium | + +#### Fixes (pve10) + +Same SSH / fail2ban / unattended-upgrades / patch steps as pve201. + +Additional: + +```bash +# Investigate ZFS pool +zpool status NAS.SP00 +# Fix PBS connectivity or remove stale datastore from Proxmox UI +``` + +--- + +## LXCs on pve201 (via `pct exec`) + +| VMID | Name | IP | Status | SSH root | Password auth | UFW | fail2ban | Upgrades | Public services | +|------|------|-----|--------|----------|---------------|-----|----------|----------|-----------------| +| 301 | vikunja-debian | 10.0.10.159 | running | without-password | **yes** | no | no | 0 | **3456**, 22 | +| 302 | qbit-debian | 10.0.10.91 | running | without-password | **yes** | no | no | 0 | **8080** (qBit), 22 | +| 303 | searchXNG-debian | 10.0.10.70 | running | without-password | **yes** | no | no | **83** | **8080**, 22 | +| 304 | wireguard-debian | 10.0.10.192 | running | without-password | **yes** | no | no | 0 | 22 | +| 305 | kuma-debian | 10.0.10.197 | **stopped** | — | — | — | — | — | replaced by LXC 218 | +| 306 | portfolio | — | **destroyed** | — | — | — | — | — | migrated → pve10 LXC **219** @ `10.0.10.106` (purged 2026-05-22) | +| 307 | jobber-delian | 10.0.10.178 | running | without-password | **yes** | no | no | **83** | **3005**, 22 | +| 308 | 
stirling-pdf | 10.0.10.43 | running | without-password | **yes** | no | no | 0 | **8080**, 22 | +| 9001 | pote-dev | 10.0.10.114 | **stopped** | — | — | — | — | — | — | +| 9101 | punimTagFE-dev | 10.0.10.121 | running | without-password | **yes** | **active** | no | **89** | **8000**, 111, 22 | +| 9401 | mirrormatch-dev | 10.0.10.141 | **stopped** | — | — | — | — | — | — | + +**Inventory mapping:** `vikanjans` → 159, `qBittorrent` → 91, `punimTag` app → 121. + +### Common LXC issues (pve201) + +| Issue | Severity | Fix | +|-------|----------|-----| +| `passwordauthentication yes` on all LXCs | High | Set `PasswordAuthentication no` in `/etc/ssh/sshd_config`, reload sshd | +| No fail2ban | High | Install fail2ban or rely on Proxmox FW + LAN segmentation | +| Apps on `0.0.0.0:8080` / 8000 / 3456 | High | Bind to localhost + Caddy, or restrict via Proxmox guest firewall (`firewall=1` on net0 — enable rules) | +| 83–89 pending upgrades on several CTs | Medium | `pct exec <vmid> -- bash -c 'apt update && apt upgrade -y'` (the `bash -c` keeps both commands inside the container) | +| Stopped dev CTs (9001, 9401) | Low | Start when needed or keep stopped to reduce attack surface | + +### Per-LXC fixes (pve201) + +```bash +# Example: harden + patch vikunja (301) from Proxmox host +pct exec 301 -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +pct exec 301 -- systemctl reload ssh + +# Patch container +pct exec 303 -- bash -c 'apt update && apt upgrade -y' + +# Copy your SSH key (from Mac, once password/key works) +make copy-ssh-key HOST=vikanjans # 10.0.10.159 +make copy-ssh-key HOST=qBittorrent # 10.0.10.91 +``` + +**punimTagFE-dev (9101):** Only LXC with **UFW active** — extend rules to deny inbound except 22 from admin subnet; still disable password auth. 
+ +--- + +## LXCs on pve10 (via `pct exec`) + +| VMID | Name | IP | Status | SSH root | Password auth | UFW | fail2ban | Upgrades | Public services | +|------|------|-----|--------|----------|---------------|-----|----------|----------|-----------------| +| 210 | cal | 10.0.10.228 | running | without-password | **yes** | no | no | 0 | **3000**, 22 | +| 215 | caseware | 10.0.10.105 | running | without-password | **yes** | no | no | **40** | **80** (nginx), 22 | +| 216 | auto | 10.0.10.59 | running | without-password | **yes** | no | no | **40** | **80** (nginx), 22 | + +**Inventory mapping:** `caseware` → 105, `auto` → 59. + +### Fixes (pve10 LXCs) + +```bash +# SSH harden caseware (215) +pct exec 215 -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +pct exec 215 -- systemctl reload sshd + +# Patch (bash -c keeps both apt commands inside the container, not on the PVE host) +pct exec 215 -- bash -c 'apt update && apt upgrade -y' +pct exec 216 -- bash -c 'apt update && apt upgrade -y' + +# Deploy keys from Mac +make copy-ssh-key HOST=caseware +make copy-ssh-key HOST=auto +``` + +**HTTP port 80 on caseware/auto:** Ensure TLS termination on Caddy (inventory host `caddy` 10.0.10.50) and no plain HTTP from WAN if exposed. + +--- + +## SSH hardening checklist (all Linux targets) + +Use this order to avoid lockout: + +1. Confirm your key works: `ssh -o BatchMode=yes root@<host> true` +2. Set `PasswordAuthentication no` +3. Set `PermitRootLogin prohibit-password` (LXCs already `without-password` — equivalent for keys-only) +4. `sshd -t && systemctl reload sshd` +5. Open **second terminal** and test before closing first +6. 
Optional: change SSH port, `MaxAuthTries 4`, disable `X11Forwarding` + +**Ansible alignment:** + +```bash +# After keys on host +make dev HOST= --tags security +# or role ssh via playbooks that include roles/ssh +``` + +--- + +## Re-run audits + +```bash +# Hypervisor full audit +ssh root@10.0.10.201 'bash -s' < scripts/security-audit-remote.sh +ssh root@10.0.10.10 'bash -s' < scripts/security-audit-remote.sh + +# Hypervisor SSH-only +ssh root@10.0.10.201 'bash -s' < scripts/security-audit-ssh.sh + +# All LXCs on a node +ssh root@10.0.10.201 'bash -s' < scripts/security-audit-lxc-via-pve.sh +ssh root@10.0.10.10 'bash -s' < scripts/security-audit-lxc-via-pve.sh +``` + +--- + +## Tracking + +| Item | Owner | Status | +|------|-------|--------| +| SSH harden pve201 | | ☐ | +| SSH harden pve10 | | ☐ | +| Restrict 8006 on both nodes | | ☐ | +| fail2ban on hypervisors | | ☐ | +| Patch pve201 / pve10 | | ☐ | +| Disable password SSH on all LXCs | | ☐ | +| `copy-ssh-keys` for inventory | | ☐ | +| TLS for :80/:8080 services | | ☐ | +| Fix ZFS NAS.SP00 on pve10 | | ☐ | + +--- + +## References + +- **[Security remediation plan](security-remediation-plan.md)** — phased fixes (critical → low) and login model +- [Security hardening guide](security.md) +- [SECURITY_HARDENING_PLAN.md](../SECURITY_HARDENING_PLAN.md) +- Role defaults: `roles/ssh/defaults/main.yml` diff --git a/docs/guides/security-remediation-plan.md b/docs/guides/security-remediation-plan.md new file mode 100644 index 0000000..81f8bc1 --- /dev/null +++ b/docs/guides/security-remediation-plan.md @@ -0,0 +1,459 @@ +# Security Remediation Plan + +**Based on:** [security-audit-report.md](security-audit-report.md) (2026-05-20) +**Goal:** Align hosts with `roles/ssh` (keys only, no password SSH) without locking yourself out. + +--- + +## How you should log in (not “ladmin → root” everywhere) + +Your inventory uses **different users on purpose**. 
After hardening, the pattern is: + +| Host type | Inventory user | How you work | Root access | +|-----------|----------------|--------------|-------------| +| **Proxmox** (`pve201`, `pve10`) | `root` | `ssh root@10.0.10.201` with **your SSH key** | Direct root (keys only, no password) | +| **Dev / QA** (`dev01`, `git-ci-01`, …) | `ladmin` (or `beast`, `master`) | `ssh ladmin@host` with **key** | `sudo` for admin tasks; Ansible `become: true` | +| **Services** (caddy, jellyfin, …) | often `root` | `ssh root@host` with **key** | Direct root (keys only) | +| **Optional bootstrap** | — | `make bootstrap-root-ssh HOST=x` | One-time: key on `ladmin` → `su` to install **root** key → then harden SSH | + +**You do not need** “SSH ladmin then su root” on Proxmox if you keep managing them as `root` in inventory — you need **root + SSH key + passwords disabled**. + +**You do** use ladmin → sudo on dev/qa boxes where `ansible_user=ladmin`. That is normal: unprivileged (or sudo) login + elevation, not password guessing on root. + +**`PermitRootLogin prohibit-password`** means: root may log in **only with a key**, never with a password. It does **not** mean “ban root; use ladmin only.” + +**`PasswordAuthentication no`** means: **nobody** (root, ladmin, etc.) can SSH with a password — keys only. + +--- + +## Phases overview + +| Phase | When | Focus | +|-------|------|--------| +| **0 — Backup + prep** | Before any change | Snapshots, `sshd` copies, git commit, keys, second SSH session | +| **1 — Critical** | Week 1 | Proxmox SSH + 8006, keys everywhere, RAM on 201 | +| **2 — High** | Week 1–2 | LXCs SSH, fail2ban, patching, app ports | +| **3 — Medium** | Week 2–4 | unattended-upgrades, Ansible `make security`, TLS | +| **4 — Low** | Ongoing | rpcbind, naming, stopped CTs, Mac, docs | + +--- + +## Phase 0 — Backup (before any hardening) + +**Yes — back up first.** SSH and firewall mistakes can lock you out; patches can break services. Use the right backup type per layer. 
+ +### What to back up (by layer) + +| Layer | What | Method | Rollback if SSH breaks | +|-------|------|--------|-------------------------| +| **Your Mac** | Ansible repo + `~/.ansible-vault-pass` (secure copy) + SSH keys | Time Machine / git commit / copy `~/.ssh` | N/A | +| **Proxmox hosts** | `/etc/ssh/sshd_config`, `/etc/pve/`, firewall rules | Copy files + **Proxmox snapshot** optional | **Console** in web UI (`pct enter` / VM console) | +| **Each LXC/VM** | Full guest state | **Proxmox snapshot** or `vzdump` | Restore snapshot or rollback CT | +| **Dev workstations** | OS + home (if Timeshift installed) | `make timeshift-snapshot HOST=dev02` | `make timeshift-restore` | +| **Central PBS** | — | **Not reliable today** — `10.0.10.200` unreachable | Fix PBS later; don’t depend on it for this work | + +### 0A — Mac / repo (5 minutes) + +```bash +cd ~/Documents/code/ansible +git status +git add -A && git commit -m "Pre-security-hardening baseline" # if you want a restore point + +# Store vault passphrase somewhere safe (password manager), NOT only on disk +# Optional: encrypted copy of ~/.ansible-vault-pass offline +``` + +### 0B — Proxmox: config files (both nodes) + +```bash +for pve in 10.0.10.201 10.0.10.10; do + ssh root@$pve "mkdir -p /root/pre-hardening-$(date +%Y%m%d) && \ + cp -a /etc/ssh/sshd_config /root/pre-hardening-$(date +%Y%m%d)/ && \ + cp -a /etc/pve /root/pre-hardening-$(date +%Y%m%d)/pve-etc 2>/dev/null; \ + ls -la /root/pre-hardening-$(date +%Y%m%d)/" +done +``` + +### 0C — Proxmox: snapshots (recommended before SSH/firewall on PVE) + +**Running LXCs on pve201** (from audit): 301–308, 9101 — snapshot each before `pct exec` SSH changes. + +**Running LXCs on pve10:** 210, 215, 216. 
+ +```bash +# On pve201 — snapshot (fast, local-lvm; needs free space) +ssh root@10.0.10.201 'for id in 301 302 303 304 305 306 307 308 9101; do + name=$(pct list | awk -v i=$id "\$1==i {print \$NF}") + echo "Snapshot vmid=$id ($name)" + pct snapshot $id pre-ssh-hardening-$(date +%Y%m%d) || echo "FAILED $id" +done' + +# On pve10 +ssh root@10.0.10.10 'for id in 210 215 216; do + pct snapshot $id pre-ssh-hardening-$(date +%Y%m%d) || echo "FAILED $id" +done' +``` + +**Optional full backup** (slower, larger) — important CTs only if snapshots fail (low disk on 201): + +```bash +vzdump <vmid> --storage local --mode snapshot --compress zstd +``` + +**Check space on pve201 first** (~2.5 GB RAM + disk — snapshot needs free space on `local-lvm`): + +```bash +ssh root@10.0.10.201 'pvesm status; free -h' +``` + +If snapshots fail for lack of space: do **0B only** on PVE, then harden SSH using **Proxmox console** as safety net (no snapshot). + +### 0D — Inventory VMs with Timeshift (`dev` group) + +Only where Timeshift is already installed (e.g. `dev02`): + +```bash +make timeshift-snapshot HOST=dev02 +make timeshift-list HOST=dev02 +``` + +Not used on Proxmox or most LXCs by default. 
+ +### 0E — Export current SSH settings (audit trail) + +```bash +mkdir -p ~/security-hardening-backup-$(date +%Y%m%d) +ssh root@10.0.10.201 'bash -s' < scripts/security-audit-ssh.sh > ~/security-hardening-backup-$(date +%Y%m%d)/pve201-ssh.txt +ssh root@10.0.10.10 'bash -s' < scripts/security-audit-ssh.sh > ~/security-hardening-backup-$(date +%Y%m%d)/pve10-ssh.txt +ssh root@10.0.10.201 'bash -s' < scripts/security-audit-lxc-via-pve.sh > ~/security-hardening-backup-$(date +%Y%m%d)/pve201-lxc.txt +``` + +### Backup exit criteria (do not skip) + +- [ ] Git commit (or branch) for ansible repo +- [ ] `sshd_config` (+ optional `/etc/pve`) copied on **both** PVE nodes +- [ ] Proxmox snapshots **or** documented reason skipped (disk/RAM) +- [ ] Second SSH session tested to `pve201` / `pve10` +- [ ] You know how to open **Proxmox → VM/CT → Console** if SSH fails + +### Rollback quick reference + +| Problem | Rollback | +|---------|----------| +| Bad `sshd_config` on PVE | Console → restore `/root/pre-hardening-*/sshd_config` → `systemctl reload sshd` | +| Bad LXC SSH | `pct rollback <vmid> pre-ssh-hardening-YYYYMMDD` | +| Bad patch on CT | Same snapshot rollback | +| Locked out of LAN on 8006 | Console → disable the offending datacenter firewall rule | + +--- + +## Phase 0 — Prep (after backups) + +| # | Task | Command / notes | +|---|------|----------------| +| 0.1 | Confirm vault password file | `~/.ansible-vault-pass` | +| 0.2 | Bootstrap control node | `make bootstrap` | +| 0.3 | Verify key on Proxmox | `ssh -o BatchMode=yes root@10.0.10.201 true` | +| 0.4 | Copy keys to inventory | `make copy-ssh-keys` (or per group) | +| 0.5 | Document admin IP | e.g. `10.0.10.127` for firewall rules | +| 0.6 | Open **second terminal** before changing `sshd` | Test login before closing first session | + +**Exit criteria:** Backups done (above) + key login works to `pve201`, `pve10`, and hosts you will harden next. 
+ +--- + +## Phase 1 — Critical + +### 1.1 Proxmox SSH (pve201 + pve10) + +**Issue:** `PermitRootLogin yes` + `PasswordAuthentication yes` — password brute force on root. + +**Fix (per host, after 0.3):** + +```bash +# On pve201 OR pve10 — keep existing session open! +sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config +sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +sshd -t && systemctl reload sshd +``` + +**Verify (new terminal):** `ssh -o BatchMode=yes root@10.0.10.201 true` + +**Ansible (later):** dedicated play for `[proxmox]` with `roles/ssh` (today `make security` only targets `dev` playbook). + +| Host | Priority | +|------|----------| +| pve201 | P0 | +| pve10 | P0 | + +--- + +### 1.2 Restrict Proxmox UI/API (port 8006) + +**Issue:** Anyone on LAN can hit full cluster API. + +**Fix (choose one):** + +- **A — Proxmox firewall (recommended):** Datacenter → Firewall → add rule: accept `8006` from `10.0.10.0/24` and/or your Mac IP; drop others. +- **B — SSH tunnel only:** no LAN exposure; `ssh -L 8006:127.0.0.1:8006 root@10.0.10.201` → browser `https://127.0.0.1:8006`. + +**Do not** block 8006 globally without A or B in place. + +--- + +### 1.3 RAM on pve201 (~2.5 GB free) + +**Issue:** New guests or updates risk OOM. + +**Fix:** + +```bash +ssh root@10.0.10.201 'free -h; pct list' +# Stop non-essential CTs/VMs or migrate workload to pve10 +``` + +Review running guests from `make proxmox-info ALL=true`; stop labs you do not need. + +--- + +### 1.4 Deploy SSH keys to unreachable inventory hosts + +**Issue:** Cannot audit or Ansible-manage hosts without keys. + +**Order:** + +1. `make copy-ssh-key HOST=caddy` (and each `[services]` host) +2. `make bootstrap-root-ssh HOST=listmonk` where root password still works but key does not +3. `make copy-ssh-keys GROUP=qa` for `ladmin` hosts + +**Exit criteria:** `make ping` succeeds for each group you will harden in phase 2. 
+ +--- + +## Phase 2 — High + +### 2.1 LXC SSH — disable password auth (all running CTs) + +**Issue:** `passwordauthentication yes` on every audited LXC. + +**Fix from Proxmox host (no Mac SSH to CT required):** + +```bash +# pve201 — example for each running VMID (305 is stopped and 306 was destroyed, so both are skipped) +for id in 301 302 303 304 307 308 9101; do + pct exec $id -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config + pct exec $id -- bash -c 'sshd -t && systemctl reload sshd' || pct exec $id -- systemctl reload ssh +done + +# pve10 +for id in 210 215 216; do + pct exec $id -- sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config + pct exec $id -- systemctl reload sshd +done +``` + +**Before disable:** install your key on CTs you need (`make copy-ssh-key HOST=vikanjans`, etc.). + +**Note:** CTs already have `permitrootlogin without-password` — keep that; only turn off passwords. + +--- + +### 2.2 fail2ban on hypervisors + +**Issue:** No brute-force protection on SSH (and eventually 8006 if proxied). + +```bash +ssh root@10.0.10.201 'apt install -y fail2ban && systemctl enable --now fail2ban' +ssh root@10.0.10.10 'apt install -y fail2ban && systemctl enable --now fail2ban' +``` + +Optional: extend to high-value LXCs via `roles/monitoring_server` or manual install. + +--- + +### 2.3 Patch backlog + +| Target | Pending | Action | +|--------|---------|--------| +| pve201 | ~105 | `apt update && apt full-upgrade -y` (maintenance window; Proxmox hosts need `full-upgrade`/`dist-upgrade`, not plain `apt upgrade`) | +| pve10 | ~92 | same | +| LXCs 303, 307, 9101 | 79–89 | `pct exec <vmid> -- bash -c 'apt update && apt upgrade -y'` | +| caseware, auto (pve10) | ~40 | same | + +**Order:** hypervisors first (after snapshot), then LXCs one by one. + +--- + +### 2.4 Application ports on `0.0.0.0` + +**Issue:** HTTP services exposed on LAN without TLS/auth. 
+ +| LXC / host | Port | Fix | +|------------|------|-----| +| qbit (91) | 8080 | Prefer VPN; or Caddy + auth; bind to internal IP | +| searchXNG (70) | 8080 | Same | +| punimTagFE (121) | 8000 | Behind Caddy; firewall allow only 10.0.10.0/24 | +| vaultwarden (142) | 8080 | Already in inventory — reverse proxy + TLS | +| portfolio | **106:80** (pve10 LXC 219, nginx) | Migrated 2026-05-22; pve201 LXC **306 destroyed** | +| vikunja (159) | 3456 | Proxy via Caddy (`todo.levkin.ca`) | + +**Pattern:** App listens `127.0.0.1` only; **Caddy** (`10.0.10.50`) terminates TLS for public URLs in inventory. + +--- + +### 2.5 pve10 infrastructure + +| Issue | Fix | +|-------|-----| +| ZFS `NAS.SP00` suspended | `zpool status`; import/clear errors | +| PBS 10.0.10.200 unreachable | Fix network/service or remove stale datastore | +| Load ~30 | Identify heavy VMs; migrate or stop | + +--- + +## Phase 3 — Medium + +### 3.1 unattended-upgrades + +Hypervisors + important LXCs: + +```bash +apt install -y unattended-upgrades apt-listchanges +dpkg-reconfigure -plow unattended-upgrades +``` + +### 3.2 Ansible security roles (by group) + +Today `make security` runs `playbooks/development.yml` on **`dev` only**. + +**Expand with new/changed playbooks:** + +| Group | Playbook idea | Roles | +|-------|---------------|-------| +| `[proxmox]` | `playbooks/infrastructure/proxmox-hardening.yml` | `ssh`, monitoring_server | +| `[services]` | extend `playbooks/servers.yml` | `ssh`, `base`, fail2ban | +| `[qa]` | tag run on qa hosts | `ssh` | +| LXCs | optional `pct` + Ansible over SSH after keys | `ssh` | + +**Workflow:** + +```bash +make check HOST=pve201 # after proxmox play exists +make dev HOST=dev01 --tags security +``` + +### 3.3 UFW on LXCs + +Only **punimTagFE-dev** has UFW today. Template for others: + +- Allow 22 from `10.0.10.0/24` +- Allow app port only if needed on LAN +- Default deny incoming + +Use `roles/ssh` UFW tasks or Proxmox guest firewall (`firewall=1` on `net0`). 
+ +### 3.4 Align names / inventory + +| Proxmox name | Ansible | Action | +|--------------|---------|--------| +| punimTagFE-dev | punimTag-dev | Rename CT or update `app_projects` name | +| vikunja-debian | vikanjans | OK (IP 159) | +| qbit-debian | qBittorrent | OK (IP 91) | + +### 3.5 Mac (control machine) + +| Issue | Fix | +|-------|-----| +| Firewall off | System Settings → Firewall → On | +| FileVault off | Enable FileVault | +| Docker on `*:3000` | Bind to `127.0.0.1` unless LAN needed | + +--- + +## Phase 4 — Low + +| Item | Fix | +|------|-----| +| rpcbind (111) on pve201 / 9101 | Disable if unused: `systemctl disable rpcbind` | +| X11Forwarding on Proxmox | Set `no` in sshd | +| Stopped CTs 9001, 9401 | Leave stopped or destroy if unused | +| `make security-audit` target | Add Makefile → runs audit scripts, appends to report | +| Quarterly re-audit | Re-run `scripts/security-audit-lxc-via-pve.sh` | + +--- + +## Suggested calendar + +| Week | Critical | High | Medium | +|------|----------|------|--------| +| **1** | 0.x prep, 1.1 SSH both PVE, 1.2 firewall 8006, 1.4 keys | 2.1 LXC passwords off (after keys), 2.2 fail2ban | — | +| **2** | 1.3 RAM 201 | 2.3 patch PVE + LXCs, 2.4 Caddy for 8080 services | 3.1 unattended-upgrades | +| **3** | — | 2.5 pve10 ZFS/PBS/load | 3.2 Ansible plays for proxmox + services | +| **4** | — | — | 3.3 UFW, 3.4 naming, 3.5 Mac | + +--- + +## Rollback (if locked out of SSH) + +- Proxmox: use **console** in web UI (or physical/IPMI) → edit `/etc/ssh/sshd_config` → `PasswordAuthentication yes` temporarily → reload sshd. +- LXC: `pct enter ` from PVE host. 
+ +--- + +## Tracking checklist + +Copy into your issue tracker or tick in [security-audit-report.md](security-audit-report.md): + +**Backup (Phase 0 — before everything)** + +- [ ] Git commit / branch for ansible repo +- [ ] PVE `sshd_config` backup on 201 + 10 +- [ ] Proxmox CT snapshots (or vzdump) on critical LXCs +- [ ] Audit outputs saved locally (`security-hardening-backup-*`) +- [ ] Console access tested in Proxmox UI + +### Critical + +- [ ] pve201 SSH: prohibit-password + no passwords +- [ ] pve10 SSH: same +- [ ] 8006 restricted to admin subnet/IP +- [ ] SSH keys on all inventory hosts +- [ ] pve201 RAM relieved + +### High + +- [ ] All running LXCs: PasswordAuthentication no +- [ ] fail2ban on pve201 + pve10 +- [ ] Patch pve201, pve10, LXCs with 40+ upgrades +- [ ] qBit / searchXNG / punimTag / vaultwarden port exposure reduced +- [ ] pve10 ZFS + PBS investigated + +### Medium + +- [ ] unattended-upgrades on PVE + key LXCs +- [ ] `make security` (or new plays) for proxmox, services, qa +- [ ] UFW on critical LXCs +- [ ] Mac firewall + FileVault + +### Low + +- [ ] rpcbind, X11, audit Makefile, naming cleanup + +--- + +## Quick reference: your login after plan + +```bash +# Proxmox +ssh root@10.0.10.201 # key only + +# Dev / QA +ssh ladmin@10.0.10.223 # key only → sudo -i when you need root + +# Services (inventory root) +ssh root@10.0.10.50 # key only + +# Proxmox UI (if 8006 restricted) +ssh -L 8006:127.0.0.1:8006 root@10.0.10.201 +# → https://127.0.0.1:8006 +``` diff --git a/docs/guides/site-lxc-git.md b/docs/guides/site-lxc-git.md new file mode 100644 index 0000000..9e67812 --- /dev/null +++ b/docs/guides/site-lxc-git.md @@ -0,0 +1,87 @@ +# Site LXCs — git deploy (levkin / caseware / auto / portfolio) + +## Remotes (correct) + +Use **`git.levkin.ca`**, not `10.0.30.169`: + +``` +git@git.levkin.ca:ilia/levkin.ca.git +git@git.levkin.ca:ilia/caseware.git +git@git.levkin.ca:ilia/auto.git +git@git.levkin.ca:ilia/sdetProfile.git +``` + +Gitea VM is 
**`10.0.10.169`** on pve10. Public `git.levkin.ca:22` hits your home IP and is **closed**; git SSH uses LAN IP via `~/.ssh/config`. + +## SSH config (on site LXC, as root) + +```ssh +# /root/.ssh/config +Host git.levkin.ca + HostName 10.0.10.169 + User git + IdentityFile ~/.ssh/id_ed25519 + StrictHostKeyChecking accept-new +``` + +## Deploy keys + +Each LXC should use its **own** deploy key in Gitea (**Repo → Settings → Deploy Keys**). +Gitea allows a public key only **once per server** — if you see *“already been added to the server”*, generate a repo-specific key: + +```bash +# On portfolio LXC 219 (via pve10) +pct exec 219 -- cat /root/.ssh/id_ed25519_gitea.pub +``` + +Portfolio uses `~/.ssh/id_ed25519_gitea` in `/root/.ssh/config` for `Host git.levkin.ca` (`IdentitiesOnly yes`). + +| LXC | Repo | Key file / comment | +|-----|------|---------------------| +| 215 | caseware | `~/.ssh/id_ed25519` → `root@caseware` | +| 216 | auto | `~/.ssh/id_ed25519` → `root@auto` | +| 219 | sdetProfile | `~/.ssh/id_ed25519_gitea` → `deploy-portfolio-sdetProfile` | +| 220 | levkin.ca | `~/.ssh/id_ed25519_gitea` → `deploy-levkin-levkin.ca` (add in Gitea UI) or HTTPS clone with read token | + +## levkin.ca routes (LXC 220) + +| Public URL | Served from | +|------------|-------------| +| `https://levkin.ca/` | `www/index.html` (spec) | +| `https://levkin.ca/folders/` | `www/folders/` (stack-folder) | + +Build before push: + +```bash +cd ~/Documents/code/levkin.ca +npm run build:www +git add www/ && git commit -m "Rebuild www" && git push +``` + +On LXC: + +```bash +pct exec 220 -- bash -c 'cd /var/www/levkin && git pull origin main' +``` + +## Push / pull + +```bash +# On LXC (via pve10) +pct exec 215 -- bash -c 'cd /var/www/caseware && git pull origin main && git push origin main' +pct exec 216 -- bash -c 'cd /var/www/auto && git pull origin master && git push origin master' +pct exec 219 -- bash -c 'cd /var/www/portfolio && git pull origin master && git push origin master' +pct 
exec 220 -- bash -c 'cd /var/www/levkin && git pull origin main' +``` + +After editing `index.html`, commit on the LXC, push, then hard-refresh the public site. + +## Gitea VM SSH (git@10.0.10.169) + +If deploy keys fail after adding them in the UI: + +1. Keys live in `/var/lib/gitea/.ssh/authorized_keys` (regenerated by Gitea). +2. OpenSSH logs in as user **`git`** — copy/sync that file to **`/home/git/.ssh/authorized_keys`** (`chown git:git`, mode `600`). +3. `command=` must run **`gitea serv`** as user **`gitea`** (e.g. `sudo -n -E -u gitea /usr/bin/gitea …`) with `SSH_ORIGINAL_COMMAND` preserved in sudoers. + +Portfolio uses repo path **`ilia/sdetprofile`** (lowercase on disk). diff --git a/docs/guides/unifi-static-dhcp.md b/docs/guides/unifi-static-dhcp.md new file mode 100644 index 0000000..8371492 --- /dev/null +++ b/docs/guides/unifi-static-dhcp.md @@ -0,0 +1,97 @@ +# UniFi static DHCP (10.0.10.x homelab) + +**Controller:** https://192.168.2.1/ +**Goal:** Pin Proxmox VM MAC addresses to stable `10.0.10.x` addresses so Caddy and Ansible inventory do not drift. + +LXCs on pve10 (**210, 215–220**) are already static via `pct set` — **no UniFi lease needed** for those rows. +This guide is for **VMs** (and pve201 guests) that still use DHCP. + +--- + +## Before you start + +1. Confirm guests get addresses on **`10.0.10.0/24`** (not only `192.168.2.x`). In UniFi, open the network that faces Proxmox `vmbr0`. +2. Gateway for homelab guests should be **`10.0.10.1`** (or your router’s IP on that VLAN). +3. Use the MAC table in [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md). + +--- + +## Method A — From a connected client (easiest) + +1. Open **https://192.168.2.1/** and sign in. +2. Go to **Clients** (or **UniFi Devices** → **Clients**). +3. Find the device (hostname like `gitea`, `vaultwarden`, or MAC from Proxmox `qm config `). +4. Click the client → **Settings** (gear) or **⋮**. +5. Enable **Fixed IP** / **Use fixed IP address**. +6. 
Set IP to the target from the table (e.g. `10.0.10.169` for gitea). +7. **Apply** / **Save**. +8. On the VM: renew DHCP or reboot: + ```bash + sudo dhclient -r && sudo dhclient + # or: reboot + ``` +9. Verify: `ip -4 addr show` shows the reserved IP. + +--- + +## Method B — DHCP static mapping (manual MAC) + +1. **Settings** → **Networks**. +2. Open the LAN/VLAN that serves **10.0.10.x** (name varies: `Default`, `Homelab`, `10.0.10`). +3. **DHCP** section → **DHCP Static IP** / **Static leases** → **Create new**. +4. Enter: + - **MAC address** (from Proxmox, e.g. `BC:24:11:E9:BD:E5`) + - **IP address** (e.g. `10.0.10.169`) + - **Name** (optional, e.g. `giteaVM`) +5. Save. Repeat for each row in the reservations table. +6. Renew DHCP on each VM or reboot. + +--- + +## Already static (skip UniFi DHCP) + +| VMID | Name | IP | How | +|------|------|-----|-----| +| 210 | cal | 10.0.10.228 | `pct set` | +| 215 | caseware | 10.0.10.105 | `pct set` | +| 216 | auto | 10.0.10.59 | `pct set` | +| 217 | identity | 10.0.10.21 | `pct set` | +| 218 | monitoring | 10.0.10.22 | `pct set` | +| 219 | portfolio | 10.0.10.106 | `pct set` (`iliadobkin.com`) | +| 220 | levkin | 10.0.10.60 | `pct set` (`levkin.ca`) | +| 106 | caddy | 10.0.10.50 | static in `/etc/network/interfaces` | + +--- + +## Priority order — UniFi reservations (VMs / pve201) + +| Order | Guest | IP | MAC | Notes | +| ----- | ----- | --- | --- | ----- | +| 1 | giteaVM | 10.0.10.169 | BC:24:11:E9:BD:E5 | | +| 2 | vaultwardenVM | 10.0.10.142 | BC:24:11:58:DB:DC | | +| 3 | n8n (WRA) | 10.0.10.154 | BC:24:11:61:DE:7A | | +| 4 | hermes | 10.0.10.36 | BC:24:11:51:1E:99 | | +| 5 | actual | 10.0.10.158 | BC:24:11:10:7B:64 | | +| 6 | jellyfin | 10.0.10.232 | BC:24:11:29:B8:84 | stopped until NAS OK | +| 7 | listmonk (pve201 VM 113) | 10.0.10.148 | BC:24:11:11:53:9A | | +| 8 | Mailcow (pve201) | 10.0.10.132 | BC:24:11:34:75:2D | | +| 9 | TrueNAS | 10.0.10.107 | BC:24:11:14:DE:B5 | optional pin | +| 10 | PVE.BU.SVR | 10.0.10.200 
| BC:24:11:DA:95:3B | lab VM | + +Full MAC table: [vm-static-ip-router-reservations.md](vm-static-ip-router-reservations.md). + +--- + +## If you only see 192.168.2.x in UniFi + +Your Mac may be on `192.168.2.0/24` while Proxmox guests use a separate **`10.0.10.0/24`** network. In that case: + +- Add or edit a UniFi network/VLAN for `10.0.10.0/24`, or +- Ensure the router bridges/routes between `192.168.2.x` and `10.0.10.x`, and +- Put DHCP reservations on the network that actually serves the Proxmox bridge. + +--- + +## After reservations + +Mark `✅ router` in [host-list.md](host-list.md) for each guest. diff --git a/docs/guides/vm-static-ip-router-reservations.md b/docs/guides/vm-static-ip-router-reservations.md new file mode 100644 index 0000000..01d579e --- /dev/null +++ b/docs/guides/vm-static-ip-router-reservations.md @@ -0,0 +1,38 @@ +# VM static IPs — router DHCP reservations (pve10) + +Proxmox **LXCs** use `pct set … ip=10.0.10.X/24` (done for 210, 215–219). + +**VMs** without cloud-init are pinned by **router DHCP reservation by MAC** (Method B in plan-2). +Ansible **cannot log into your router** — configure static leases in the UI. + +**Your UniFi:** https://192.168.2.1/ — step-by-step: [unifi-static-dhcp.md](unifi-static-dhcp.md). + +Homelab guests use **`10.0.10.0/24`** (gateway `10.0.10.1`). If UniFi also serves `192.168.2.x`, ensure the `10.0.10.x` segment is the network those VMs/LXCs use +(or that routing/DHCP relay matches your Proxmox bridge). + +## How to add a reservation (any router) + +1. Open router admin (UniFi: **https://192.168.2.1/**). +2. Find **DHCP** / **LAN** / **Static leases** / **Reserved addresses**. +3. For each row: **MAC address** → **IP address** → Save. +4. Reboot guest or renew DHCP (`dhclient -r && dhclient` on Debian) if IP does not update immediately. +5. Mark done in [host-list.md](host-list.md). 
+ +| VMID | Name | MAC | Reserve IP | Inventory | +| ---- | ---- | --- | ---------- | --------- | +| 102 | gitea-alpine | `BC:24:11:E9:BD:E5` | `10.0.10.169` | giteaVM | +| 103 | WRA / n8n | `BC:24:11:61:DE:7A` | `10.0.10.154` | n8n | +| 104 | vaultwarden | `BC:24:11:58:DB:DC` | `10.0.10.142` | vaultwardenVM | +| 105 | TrueNAS | `BC:24:11:14:DE:B5` | `10.0.10.107` | — | +| 106 | caddy | `BC:24:11:E0:49:B4` | `10.0.10.50` | ✅ static in-guest | +| 108 | actual | `BC:24:11:10:7B:64` | `10.0.10.158` | actual | +| 117 | hermes | `BC:24:11:51:1E:99` | `10.0.10.36` | hermes (guest agent on) | +| 200 | PVE.BU.SVR | `BC:24:11:DA:95:3B` | `10.0.10.200` | — | +| 201 | NextcloudAIO | `BC:24:11:14:D4:DE` | `10.0.10.24` | **decommission** — skip new work | +| 101 | Jellyfin | `BC:24:11:29:B8:84` | `10.0.10.232` | stopped | +| 113 | listmonk (pve201) | `BC:24:11:11:53:9A` | `10.0.10.148` | listmonk | +| — | Mailcow (pve201 VM 106) | `BC:24:11:34:75:2D` | `10.0.10.132` | mailcow (inventory) | + +After reserving in the router, mark **DHCP/Static** as `✅ router` in [host-list.md](host-list.md). + +In-guest static (optional, stronger): SSH as root and set `/etc/network/interfaces` like caddy VM 106. diff --git a/inventories/production/group_vars/all/mailcow.yml b/inventories/production/group_vars/all/mailcow.yml new file mode 100644 index 0000000..158ad17 --- /dev/null +++ b/inventories/production/group_vars/all/mailcow.yml @@ -0,0 +1,15 @@ +--- +# Mailcow mailbox definitions (passwords live in vault only). +# Create: make mailcow-mailbox MAILBOX= +# Add a new key under mailcow_mailboxes + vault_mailcow_mailbox_passwords. 
+mailcow_url: "https://mail.levkine.ca" +mailcow_domain: "levkine.ca" + +mailcow_mailboxes: + alerts: + local_part: alerts + name: Monitoring Alerts + quota: 1024 + vault_password_key: alerts + +mailcow_api_key: "{{ vault_mailcow_api_key | default('') }}" diff --git a/inventories/production/group_vars/all/main.yml b/inventories/production/group_vars/all/main.yml index ffc80db..a9e098e 100644 --- a/inventories/production/group_vars/all/main.yml +++ b/inventories/production/group_vars/all/main.yml @@ -26,6 +26,10 @@ maintenance_pre_reboot_delay: 5 # Delay before reboot in seconds # Default Tailscale settings - these tell the playbook to use your vault key tailscale_auth_key: "{{ vault_tailscale_auth_key | default('') }}" + +# Mailcow — API key + per-mailbox passwords in vault; definitions in group_vars/all/mailcow.yml +mailcow_api_key: "{{ vault_mailcow_api_key | default('') }}" +mailcow_mailbox_passwords: "{{ vault_mailcow_mailbox_passwords | default({}) }}" tailscale_accept_routes: true tailscale_accept_dns: true tailscale_ssh: false diff --git a/inventories/production/group_vars/all/vault.example.yml b/inventories/production/group_vars/all/vault.example.yml index b996499..a2c941a 100644 --- a/inventories/production/group_vars/all/vault.example.yml +++ b/inventories/production/group_vars/all/vault.example.yml @@ -22,6 +22,33 @@ vault_ssh_public_key: "ssh-ed25519 AAAA... 
you@example" # LXC create bootstrap password (often required by Proxmox) vault_lxc_root_password: "CHANGE_ME" +# Mailcow API — System → Configuration → Access → API (read/write) +vault_mailcow_api_key: "CHANGE_ME" +# Per-mailbox passwords (make mailcow-mailbox MAILBOX=) +vault_mailcow_mailbox_passwords: + alerts: "CHANGE_ME" +# Legacy alias (optional) +vault_alerts_mailbox_password: "CHANGE_ME" + +# Uptime Kuma + SMTP (monitoring LXC) +vault_uptime_kuma_url: "http://10.0.10.22:3001" +vault_uptime_kuma_user: "admin" +vault_uptime_kuma_password: "CHANGE_ME" +vault_kuma_smtp_host: "mail.levkine.ca" +vault_kuma_smtp_port: "587" +vault_kuma_smtp_user: "alerts@levkine.ca" +vault_kuma_smtp_password: "CHANGE_ME" +vault_kuma_smtp_to: "idobkin@gmail.com" + +# Umami (monitoring LXC /opt/monitoring/.env) +vault_umami_db_password: "CHANGE_ME" +vault_umami_app_secret: "CHANGE_ME" + +# Hermes Mattermost (not Telegram) +vault_mattermost_url: "https://slack.levkin.ca" +vault_mattermost_token: "CHANGE_ME" +vault_mattermost_allowed_users: "CHANGE_ME" + # ----------------------------------------------------------------------------- # POTE (python/venv + cron) secrets # ----------------------------------------------------------------------------- diff --git a/inventories/production/group_vars/all/vault.yml b/inventories/production/group_vars/all/vault.yml index c96d344..adc1b49 100644 --- a/inventories/production/group_vars/all/vault.yml +++ b/inventories/production/group_vars/all/vault.yml @@ -1,100 +1,125 @@ $ANSIBLE_VAULT;1.1;AES256 -38316537376634623462313731323238666165383731656632373665653534623163386333303865 -3865383030316132663831303932376437346335323233630a643331663539383163306666393764 -38313265656561343839616565343663353037663237663032366632373831363336306632626266 -3361643865333533340a356233663034343932323831323236356161396237346532323838373135 -33393239313730363336613338373039663735323431323562613363343863326234633833663631 
-66343462623231663932633537373361313764393630356666393662653135356139663935613038 -65383261363065633235343031346535373564373931373063386265343335623265653739613830 -32656233393330633362623932316431383761306332393466313936396533333839313831663331 -34353864356336303331663233653666363966376162303731626134313235306238323363303439 -32333039653235326632303637303065386161616138356463623561366637376366326262303166 -38323763393934666539373063323265333961666164613437316164633565393035626538353365 -33386562336665383863636639643232623161643933313664396534383362303838663362653736 -64393334616165336638306235363734653431646431616139373336656333623963386538646230 -39663230363063386231343730663162313463666135323265613261626637626332353534396535 -31623664363766646332396336396133613662643232366433323330373962633839613635333763 -63306230623438346639323863353137363330316630316130326134323731326635643736373736 -62336362656265633233623165376436373231656666303832373966353732313031623865316663 -63356163636238346230623732326232646434623532633439646536656362393162613535613565 -66616539316362376561386263373464623030636661663435383839643565393632616232663035 -34653735383964653930633664346330386566343830336238306562343164366131643138643339 -35313366356637643262636238366263353535306434633732623335643266396335666636666663 -37333232393765306433326164663538663839623034373535653737633366303665633831303334 -32303061363863386139613464326466336136396534663538643163343439343763383534306636 -62353733613330376163386331626463656462336237656339356132643135363537343638303261 -33366332653439313137613665386136666536356537346665333935366336623734393738346434 -63326265346362636564366265373134336662626332653464646139656635313961656230336537 -63666638326337643033363964643339666130386139363138656165666333356465643337396165 -30336330633632353231613938646165383966613863366330646162646266346139343434393865 -66346365663230626531643963383462636465363965393762336233366538393133313138616335 
-32353834313762363265643031343237633732393166343139363163326439666162396332353038 -31306530626666343361313736313636613335376163383237303063393333386663333333336137 -37346166316231623638386635613230663063653037643930333961316434643361633035633734 -65643937636361653433383262643265373165613437336236633631323635613034663834646665 -30373730373438613132633932333565376665333565383932356334653738646166393934626362 -30666666303832613633316230623038343165396338343535663931383639623430643238656261 -39623037333063306266323335303736346236636137633863353866343136346335353865303961 -31346331333066376330306361396262333762393838303165383134303435353630366130303536 -34386532356239326166386665623435646432636561363564656161646563306234333138333839 -38316337656631313763393135396464643338386636336234346663653538353863643636323032 -35326133623064363838386662653138613438386564316635373838366262656364666633636539 -61306563666138656161336466323537626161313366616662623362643036636132663634313137 -39653437306662646162613763343736636530356465346132646238633166373838353836326461 -36326666323636353239303262623436643932353164323630326635653635653233363265316264 -30653763643431626539356161376534396437636463303363663134373961616561363561333333 -34306537326666383664336464656464623731656566653132613565336536323438666333366466 -64613738653730333633383062653837366266316536653139643362373039383831363666333934 -34383833336266356436666636323239336432386133303466636138643934356266326533643161 -36393664313963393930383533623565383332613933396639613037323266663439313138326261 -30353861303661303836343165353362663632306430626337356562343637653164396237333566 -37656230363530323836373363646334356262646633313932383161303264613238373936353036 -61376264633930356465626266623930333039383032316163633037323035346130343934616261 -31666166393462366561303833353135326566356637376466613934376233303162323033623031 -63656131333439353537623662363530383866326432306361316465383137633536666364623662 
-37353561633839623530333663643130326131333330626661396636343234666139336539653162 -62383636663137626637303535333862366434626161353239393232313537343865646564626331 -39366665363030643764663963316163343033326434373265343664393439316333346434363563 -61346164396561343865626362616433306230333130653166656230353364316536626432373333 -35383133363530666263316431396462383133363965336637386632363263656261353963313161 -36383632326264373436383638383064346334336238656239393833653531656461356136303434 -37663434663732306631656334306361663562303863386135623066633963373034373139666332 -35393433646333363839666434663535363661616330386234366132303161383063663836626561 -35393064343735303032313266643338623834383838633834636536363539656466663864613366 -66636363623330326436363936313938333638323939323035616232366563316364343834376630 -66656434336661643861613737616138396330383832386230383331646462323363373363393733 -63363237636137373566363438663966396432613964336164326138623737393636396234646232 -64343361363365356135666235623833396131626663303839653535663732313831633163643638 -35396262373837343238343838663635353838373338663732626330613237623332336436643136 -38653833383430393837383566643765653834306636356466326364303334653034626262356630 -34333338333336373433356235386337346666343830303164363235303265313134323339653339 -63316238346132653663653165313635336638646362356337643766366564383531633565303431 -66616433663630343439336661346266336139613537653438653432326666326137306364376137 -66333939643262633532363966623439373434393862353237613135646663623236646331643537 -31353566653464313433636635393330646166613232633734346639326534373163383064353732 -32373861303064346266643338316465653031646633633936373738663837383162643534623131 -31633662356534343636313834386139656439663733333762323962323939623032396239356437 -37633739613433613365313337383835623936623530363831383535663337343264356532616434 -39393634396664636166346631313764343733666534613935393637363233373331303837656463 
-37363266363634353136316532333462396266373733333633356239653334363835326261323661 -66323032346364356230613831643236316530356132343863393361343462373433383265336333 -30343730316366366234333263343965633466333439653739663333643939303631353664316435 -36396139623562656632666165666662626263643436396431326135633932393965656531633761 -39303634643936366438336534613532303134343164326661626363656562383564623264636132 -39656636303636393761653035303832386430646162343830343834316534636263373763643765 -61366335643531666232303231656336643833396238336639333437363564636566636632303364 -62623738336237393638363436396662656565653839643164356565313563663561666237383036 -33626464663465643230376164653062663063636630613064643632643235643662653566333333 -62353763643830363638323731303537633837393235656661333263323536363330356362643333 -34346666656432626365383639326538643862346265316263326531623631383962383734316330 -39333430613761663337306331623461643635653431343336663163343766373464366538313335 -61643538643231333636643836663663313534356662386532633331346664653262353839643066 -36393366653131316636646336313362656662666163333635633132323438353435373430643839 -37623936393962333065663536306238653466363634386632366637363265303734356535333735 -64623330303965393533326563643063303762646664666464643239386435343065326234306632 -35346338373866303838613933653230373737396134653533376265356432333933356237636338 -66656536393530316435323863373962636465333331653364626162326562393565313538633264 -34613633393862333731336563636136666166613037613833333063303162373339663539646631 -36303962356562306239616634376339356135666663303836353061663039343836356262373932 -65346466373532633365383835323062313531623130396130376531626333653862393462643631 -366330333666336262373364663864336633 +38333966306139633330626334636166336434613661376233313731353237353562376237323166 +3736663161376133336431353334306533316337633662310a646533656261633333306433626564 
+33626434353661343431316632643938663639356531336564653230353439316236343861643665 +6231393530333937340a333033366564393536613330373232373861666439316536336164306633 +31373433653531363636663262616535643137363039356534313462653232663663343464303938 +33633838373935333433653732656261633463653835393864353862346563303063656431343065 +61323331363032383365613734373165343530303230373237346162306361613461353939623934 +64303138383537386435653461356130356563653036343339333761303030393933393735616531 +33386462303037613263373036386332656563346539633131366636333163376162613231313337 +64336137373038636233346539616136343933343635353639633633616438333739303864376162 +36656639313966633234323738326435373935363166626664613561636637396166353961623262 +31333064306537376631656235636265313235643339353735373666316364616432336536303830 +39393136393864383035633462366637396438323838643337633361373132363365616333613431 +30326533366265303165653761333034656261363862353061383761363530666135373265623332 +33373538616433383835663139383065366433333939356366353635633834666362646465366130 +31636235613934313465646136623834343062353539653163373032326130303034653365653431 +63306635323431376562396236653966633833396262343664643562366235393961316564656565 +61356436313363376233376137303062656462363933643465616436353964373837383536306136 +35626163393638353261633030653164643063626463383133666137323333633463616138643931 +38346633653430303031643830363166363561346336646666343330303164336164333561386535 +66616661306133626164343166303362383262636331313465343434643262353862313438616462 +33383734626463616330666265636265623064326635633066656533306530376663653366613534 +36666337346238333137303931633631366236373236383932343763653637343434336462343662 +64313239313435353365383338376133386639326136636164386439306665663965353565333030 +65363139636134656333616335643435643038373832383134636666303536663236303231313030 +61616664373264663763343334303437643264396435373230333561323036363764383730373461 
+61636661316330373732363835303039346438313133393862306138613634333334356633346232 +63666132303939656465356665323435326435333135303735346332613134633736333338653066 +31616532616537343735326232613235323364386636396531383333316633666338306635656565 +63316338343032346261343863623163353934653434363336643836353431643937393261393339 +61363562373533396631623830613431663262643631663637396663626466663634323037666662 +65663132393863333135663831386132646533353535326430323864396132343762623464643461 +35306330666635343362316239386463633161623664653063356561356166613332363432393730 +33646439663039653037383630356166323733373963353239643231326338633838623033633339 +66666630306130336632333736396335666437383164633466373534333334356261383538353363 +30623461333365633536663236363661323835356361363331653437613131303732643134343038 +32663338356462343535396534646263656331366265356532616234663966626138633031323866 +33346662336534323037353835333032633965326163623365643230666339363566353938623931 +33316539396538333433373236656339396165313930613331396135666236326231336563343063 +34646233336137323166663635323266613635343363636334353865343931616665613462613764 +30323865623164303333333166393963613535616563316531383231313239666337343961333938 +34663931343535333830333036646463356132613064663037323237366563656239343665653263 +32343535653037633931653565663166623736306166623363316632316236663534383938656564 +32633734373336383630663436373863343136663337306364326432663763326561363961623464 +31326263623935343933333739373038373838616432646533316230613762336236306338616163 +34333266316537646439343937366261303833363665373734386632613733313435336438343534 +61393363396261666265396361313063636334623765613564393736616461313438613234333661 +64383764653464373131326332656435343163613561623762663532643130666338633736393931 +34316535376235616533353831343537363533346331316332323439383837303631626261316564 +30383566363737643065356565346161376637646431633732636333373862653966323461356535 
+64613964666135373038656364376334336631376261373338643737633266393761623837643730 +38316233626130383231623930346338306164653336643066656665356463313131343738316230 +66316133306134656330643532303538373661333161343133613266333465663534326231306461 +65653634373934323432303833353339356531313164346238623639373363393137336334306131 +39393463613032633533363236323730386133356135383030656261363761333765383831646238 +35386164353462646236306337393364323665316364626265363736316638353266626665393662 +38663137626361366334373033643864613664656631616532373935313031343633373631323533 +34656561396463336662313834653634306435616439336161323763313732313331663436633663 +35323961393133343566623937313064646532643638336163633538613465363138653161386238 +62333139336537656339333737363933346333633534396230356561303063626266666661366130 +38626338353336616161373334306165333930646563613436303233666563636462643435396233 +62323634393063616461653134353133323664346566663664383766313939653036303930633331 +66353762623338303530633463336533373634333734653430303139366637373130306561653264 +34333533666437343732363036356132313230323838373233636631336434313563336366316466 +63393633363461393164323063396238346262623136623639383963616662323137633139323766 +31303765323730303863376166386631643031306130396338376538373362323335643964303137 +62626131656262613437383036636438383262396533646163363365326134633834666236333335 +65633037626335376230303937366463376664363062366361663362373434656637636230623561 +39626634343761303030346365633333333039326364303762326461316361343231363932323336 +39623033303232316263323433366638393435336563636138343261636561356363366138653033 +62373731623461363135383037613065396264333966353436613466663931343033326363323138 +62306133613163633134626138663434356562633936346239373837336439653061613762626533 +38623366313464393631666330353738393538366537313637613732613532663339653637616633 +65623637373230333738343136393332376364316438633164306539336233373065396339373562 
+31383163316231356538626333323533663863383339643363303334323833353164356662326530 +34653630663330663330323864333965303236313266393636333839643863666236646665633137 +33353038626562663266386161393331326636353862643233326231623063623463313231373862 +31333639626232306339373435386562663035303633383333653066643361643139356134633264 +33363832353735633462363761343138323234356530656136636236623365353531356337393234 +37363133333763643863373338616532666464336238363631636131313261326164313430363434 +62363730623464343532653431353266336262363262373933646234653563663535363133343634 +64663535363231353738303663626166383831383531363130373466633532356635313530383432 +63636462656236303033376637643462616230626163373832666337636263333866313466616563 +62613162363633353235363039366365396662383335386165373233633539616530363264653266 +62643138333631353138336366646632386563353431343737363265353065373834326432623265 +30663630323361353635613363633032386465623139376630653038376536616462326134343363 +37643638323731313065653931663739306134323861313538313965636632653064393033376231 +36663666633836646636376166356361633961626466383030656162363362396566333832393439 +62306265386638333138363764646331643136636566343736613862343233303461633661643832 +35653839303039383233373532643632353964343365396131393933636537656334316466313531 +36633364643230336161316639313130316131663663393966333162373632386635393130313263 +64656439663135373265383732316435346135376563356630316662333664353564333038313730 +66346131396132366632306633656334376334653038646535383135636665396362343238346663 +36643132666434633730653431346265353662613265326230653333396239626633346633343231 +38303739303665343933633439623131333632383432343962653130396666373164633431653663 +35353264653833306163646164376234666364363766336564346332393831336537663936346433 +37346438353835353736316530323336336334376133663834363161326563353966356534333830 +64656164356661343462646536366234323062323164636434333863346337303661366164646562 
+64383666343339346332643832616266346439353863616138613965373764333261356331316466 +62643939643461363238386463346638373630333437633737636630666161323461616539306634 +64646666626461306563393830396661313636633332396132363961373038386566646230323739 +62373064323761316135613538663132316365633339356664316365383234303635663435363239 +34336236663435643563376130396535623137333466363536393031303139356565313766656432 +64313365383631383034313831393462666437663733633165643230663539613630643264376631 +66653861313639666235613034633935633836656638643764343639373931366332373837343765 +61313765326362303963666165373364663664313631373136623437343837396165313930636165 +39323030303839333036393432383731303030643430643766383662366335386230623163303733 +37303232346534333433626330343637313534363562653133383966356538396638663762326530 +35336166393763626466323863663137386531356436306530323738373365643635613231636564 +62333839336137353833353036323533333163663331663033633938633533626637653538613038 +38613539303534366437633135616631303261643135616436653664326132356636653931306564 +62616434353733303863376361356465613531306534376333613261323764303137306266636434 +64363238633736643361393730626666656664333233616361643834373239623230303533343935 +31343362333735386338643433613333613736323639646562323437313733303331396136383762 +31663137386431386630343666663139363736313731323930313539313939623832313864386637 +66316531343238303936323234653033303666333233323334623837653665353565666335323638 +37363466373363333362656563383066366434306262323363336533356531363861356162326162 +66316135653963323765343934306630633132353036346536613663386339393632393764303530 +62333330306136346265306237393435353430313635393339363038313137623663316331656539 +31396361623230326433393239626536636437623737363131653363646237656165346463643338 +35306536376634336264643564346163373233666330393630633339346533653963346630396139 +36363430303866616334666631653732306230626238653463626132666638643938623030373538 
+32353062626562396134393230386562346163643531376630616161646633333131383437386330 +34393665646530306663 diff --git a/inventories/production/host_vars/caddy.yml b/inventories/production/host_vars/caddy.yml index 6c7bc82..94184ae 100644 --- a/inventories/production/host_vars/caddy.yml +++ b/inventories/production/host_vars/caddy.yml @@ -1,4 +1,3 @@ ---- $ANSIBLE_VAULT;1.1;AES256 66633265383239626163633134656233613638643862323562373330643363323036333334646566 3439646635343533353432323064643135623532333738380a353866643461636233376432396434 diff --git a/inventories/production/host_vars/git-ci-01.yml b/inventories/production/host_vars/git-ci-01.yml index bb80d7e..c34b91c 100644 --- a/inventories/production/host_vars/git-ci-01.yml +++ b/inventories/production/host_vars/git-ci-01.yml @@ -1,8 +1,18 @@ --- -# Configure sudo path for git-ci-01 -# Sudo may not be in PATH for non-interactive shells +# git-ci-01 — Gitea Actions runner (VM 115 on pve201 @ 10.0.10.223) ansible_become_exe: /usr/bin/sudo ansible_become_method: sudo -# Alternative: if sudo is in a different location, update this -# ansible_become_exe: /usr/local/bin/sudo +# Proxmox (manual / qm): VMID 115, 2 cores, 4096 MB RAM, 64 GB disk (scsi0) +# act_runner: /etc/act_runner/config.yaml — capacity 2, force_pull false +# Maintenance: /etc/cron.weekly/docker-prune-ci (docker system prune -af --filter until=168h) +# +# Capacity notes (2026-05-23): +# - pve201: ~3 GB RAM free (125 Gi total, heavily overcommitted — GPU VM 104 @ 72 Gi) +# - capacity 3 needs ~8–12 GB RAM on this VM → migrate runner to pve10 or add RAM after freeing pve201 +# - 12 repos: capacity 2 on one runner is OK; second runner on pve10 if queues stack up + +git_ci_runner_capacity: 2 +git_ci_disk_gb: 64 +git_ci_proxmox_vmid: 115 +git_ci_proxmox_node: pve201 diff --git a/inventories/production/host_vars/hermes.yml b/inventories/production/host_vars/hermes.yml new file mode 100644 index 0000000..8337b6d --- /dev/null +++ 
b/inventories/production/host_vars/hermes.yml @@ -0,0 +1,4 @@ +--- +# Hermes agent VM 117 @ 10.0.10.36 (user: hermes, admin: ladmin) +# Secrets: vault_hermes_telegram_bot_token, mattermost in /home/hermes/.hermes/secrets/ +hermes_home: /home/hermes/.hermes diff --git a/inventories/production/host_vars/localhost.yml b/inventories/production/host_vars/localhost.yml new file mode 100644 index 0000000..2cfb893 --- /dev/null +++ b/inventories/production/host_vars/localhost.yml @@ -0,0 +1,4 @@ +--- +# Control node (runs playbooks with connection: local). +# Use project venv so API deps (proxmoxer, etc.) match `make bootstrap`. +ansible_python_interpreter: "{{ inventory_dir }}/../../.venv/bin/python3" diff --git a/inventories/production/host_vars/mailcow.yml b/inventories/production/host_vars/mailcow.yml new file mode 100644 index 0000000..a35b17a --- /dev/null +++ b/inventories/production/host_vars/mailcow.yml @@ -0,0 +1,7 @@ +--- +# Mailcow VM 106 on pve201 (Mailcow-debian) +# API/UI: https://mail.levkine.ca — domain levkine.ca (with e) +# SSH: root only (no ladmin). First access: make copy-ssh-key-mailcow +mailcow_url: "https://mail.levkine.ca" +mailcow_domain: "levkine.ca" +mailcow_alerts_user: "alerts" diff --git a/inventories/production/hosts b/inventories/production/hosts index e95619a..f4c6833 100644 --- a/inventories/production/hosts +++ b/inventories/production/hosts @@ -2,11 +2,22 @@ # Primary IPs: Tailscale (100.x.x.x) for remote access # Fallback IPs: Local network (10.0.x.x) when Tailscale is down # Usage: ansible_host_fallback is available for manual fallback +# Public URLs: levkin.ca DNS A records → Caddy (142.180.237.136), except home → 100.100.100.100 # # NOTE: Proxmox app projects (dev/qa/prod) are provisioned dynamically via # `playbooks/app/site.yml` (it uses `add_host` based on `app_projects`). # You generally do NOT need to add project hosts here. 
+[proxmox] +pve201 ansible_host=10.0.10.201 ansible_user=root +pve10 ansible_host=10.0.10.10 ansible_user=root + +[sites] +levkin ansible_host=10.0.10.60 ansible_user=root url=https://levkin.ca proxmox_vmid=220 proxmox_node=PVENAS +caseware ansible_host=10.0.10.105 ansible_user=root url=https://caseware.levkin.ca proxmox_vmid=215 proxmox_node=PVENAS +auto ansible_host=10.0.10.59 ansible_user=root url=https://auto.levkin.ca proxmox_vmid=216 proxmox_node=PVENAS +portfolio ansible_host=10.0.10.106 ansible_user=root url=https://iliadobkin.com proxmox_vmid=219 proxmox_node=PVENAS + [dev] dev01 ansible_host=10.0.30.105 ansible_user=ladmin bottom ansible_host=10.0.10.156 ansible_user=beast @@ -22,25 +33,33 @@ KrakenMint ansible_host=10.0.10.120 ansible_user=ladmin [ansible] ansibleVM ansible_host=10.0.10.157 ansible_user=master -[tailscale] -tailscaleVM ansible_host=100.66.218.53 ansible_user=ladmin +[comms] +# pve201 — email + newsletters +mailcow ansible_host=10.0.10.132 ansible_user=root url=https://mail.levkine.ca proxmox_vmid=106 proxmox_node=pve201 +listmonk ansible_host=10.0.10.148 ansible_user=root url=https://listmonk.levkin.ca proxmox_node=pve201 [services] -caddy ansible_host=10.0.10.50 ansible_user=root -jellyfin ansible_host=10.0.10.232 ansible_user=root -listmonk ansible_host=10.0.10.148 ansible_user=root -nextcloud ansible_host=10.0.10.25 ansible_user=root -actual ansible_host=10.0.10.158 ansible_user=root -vikanjans ansible_host=10.0.10.159 ansible_user=root -n8n ansible_host=10.0.10.154 ansible_user=root -giteaVM ansible_host=10.0.10.169 ansible_user=root -portainerVM ansible_host=10.0.30.69 ansible_user=ladmin -homepageVM ansible_host=10.0.30.12 ansible_user=homepage -vaultwardenVM ansible_host=10.0.10.142 ansible_user=ladmin +# VMID 117: on PVENAS (pve10) +hermes ansible_host=10.0.10.36 ansible_user=ladmin url=https://hermes.levkin.ca proxmox_vmid=117 proxmox_node=PVENAS +caddy ansible_host=10.0.10.50 ansible_user=ladmin proxmox_vmid=106 
proxmox_node=PVENAS +cal ansible_host=10.0.10.228 ansible_user=root url=https://cal.levkin.ca proxmox_vmid=210 proxmox_node=PVENAS +identity ansible_host=10.0.10.21 ansible_user=root url=https://auth.levkin.ca proxmox_vmid=217 proxmox_node=PVENAS +monitoring ansible_host=10.0.10.22 ansible_user=root url=http://10.0.10.22:3001 proxmox_vmid=218 proxmox_node=PVENAS uptime_kuma_port=3001 dockge_port=5001 umami_port=3000 +giteaVM ansible_host=10.0.10.169 ansible_user=root url=https://git.levkin.ca proxmox_vmid=102 proxmox_node=PVENAS +n8n ansible_host=10.0.10.154 ansible_user=root url=https://n8n.levkin.ca proxmox_vmid=103 proxmox_node=PVENAS +vaultwardenVM ansible_host=10.0.10.142 ansible_user=ladmin url=https://vault.levkin.ca proxmox_vmid=104 proxmox_node=PVENAS +actual ansible_host=10.0.10.158 ansible_user=root url=https://budget.levkin.ca proxmox_vmid=108 proxmox_node=PVENAS +vikanjans ansible_host=10.0.10.159 ansible_user=root url=https://todo.levkin.ca qBittorrent ansible_host=10.0.10.91 ansible_user=root port=8080 +jellyfin ansible_host=10.0.10.232 ansible_user=root url=https://jelly.levkin.ca proxmox_vmid=101 proxmox_node=PVENAS # stopped until NAS pool healthy -[desktop] -desktop-beast ansible_host=100.117.34.106 ansible_user=beast +# Retired / stopped — kept for reference; do not run playbooks against these without intent +# nextcloud ansible_host=10.0.10.24 ansible_user=root url=https://nextcloud.levkin.ca # VM 201 decommission +# portainerVM ansible_host=10.0.30.69 ansible_user=ladmin # retired → Dockge on monitoring +# homepageVM ansible_host=10.0.30.12 ansible_user=homepage # VM 100 stopped on pve10 + +#[desktop] +#desktop-beast ansible_host=100.117.34.106 ansible_user=beast [local] localhost ansible_connection=local diff --git a/playbooks/caddy-auth-authentik.yml b/playbooks/caddy-auth-authentik.yml new file mode 100644 index 0000000..395cfed --- /dev/null +++ b/playbooks/caddy-auth-authentik.yml @@ -0,0 +1,53 @@ +--- +# Playbook: caddy-auth-authentik +# 
Purpose: Add auth.levkin.ca reverse proxy to Caddy (Phase 1 Authentik) +# Targets: caddy +# Usage: make -f Makefile caddy-auth OR ansible-playbook playbooks/caddy-auth-authentik.yml + +- name: Add Authentik proxy block to Caddy + hosts: caddy + become: true + become_method: ansible.builtin.su + + tasks: + - name: Ensure auth.levkin.ca HTTPS block exists (after cal block) + ansible.builtin.shell: | + set -euo pipefail + if grep -q '^auth\.levkin\.ca {' /etc/caddy/Caddyfile; then + echo UNCHANGED; exit 0 # block already present: tell Ansible nothing was modified + fi + awk ' + /^cal\.levkin\.ca \{/ { in_cal=1 } + in_cal && /^}$/ && !done { + print + print "" + print "auth.levkin.ca {" + print " import security-headers" + print " encode gzip" + print " reverse_proxy 10.0.10.21:9000" + print "}" + done=1 + next + } + { print } + ' /etc/caddy/Caddyfile > /tmp/Caddyfile.new + mv /tmp/Caddyfile.new /etc/caddy/Caddyfile + args: {executable: /bin/bash} + register: auth_https_block + changed_when: "'UNCHANGED' not in auth_https_block.stdout" # only report changed (and reload Caddy) when the block was actually inserted + notify: Reload caddy + + - name: Ensure auth.levkin.ca HTTP redirect in :80 block + ansible.builtin.blockinfile: + path: /etc/caddy/Caddyfile + marker: "# {mark} ANSIBLE MANAGED auth.levkin.ca :80" + insertafter: '@vault host vault.levkin.ca' + block: | + @auth host auth.levkin.ca + redir @auth https://auth.levkin.ca{uri} permanent + notify: Reload caddy + + handlers: + - name: Reload caddy + ansible.builtin.command: caddy reload --config /etc/caddy/Caddyfile + changed_when: true diff --git a/playbooks/caddy-levkin-site.yml b/playbooks/caddy-levkin-site.yml new file mode 100644 index 0000000..ad3c300 --- /dev/null +++ b/playbooks/caddy-levkin-site.yml @@ -0,0 +1,55 @@ +--- +# Playbook: caddy-levkin-site +# Purpose: Add levkin.ca reverse proxy to Caddy (site LXC 220) +# Targets: caddy +# Usage: make caddy-levkin + +- name: Add levkin.ca proxy block to Caddy + hosts: caddy + become: true + become_method: ansible.builtin.su + + tasks: + - name: Ensure levkin.ca HTTPS block exists (after caseware block) + ansible.builtin.shell: | + set -euo pipefail + if grep -q '^levkin\.ca,' 
/etc/caddy/Caddyfile || grep -q '^levkin\.ca {' /etc/caddy/Caddyfile; then + echo UNCHANGED; exit 0 # block already present: tell Ansible nothing was modified + fi + awk -v upstream="{{ levkin_site_upstream | default('10.0.10.60:80') }}" ' + /^caseware\.levkin\.ca \{/ { in_cw=1 } + in_cw && /^}$/ && !done { + print + print "" + print "levkin.ca, www.levkin.ca {" + print " import security-headers" + print " @www host www.levkin.ca" + print " redir @www https://levkin.ca{uri} permanent" + print " reverse_proxy " upstream + print "}" + done=1 + next + } + { print } + ' /etc/caddy/Caddyfile > /tmp/Caddyfile.new + mv /tmp/Caddyfile.new /etc/caddy/Caddyfile + args: + executable: /bin/bash + register: levkin_https_block + changed_when: "'UNCHANGED' not in levkin_https_block.stdout" # rc is 0 on every successful run, so key off the early-exit marker instead + notify: Reload caddy + + - name: Ensure levkin.ca HTTP redirect in :80 block + ansible.builtin.blockinfile: + path: /etc/caddy/Caddyfile + marker: "# {mark} ANSIBLE MANAGED levkin.ca :80" + insertafter: '@vikunja host todo.levkin.ca' + block: | + @levkin host levkin.ca www.levkin.ca + redir @levkin https://levkin.ca{uri} permanent + notify: Reload caddy + + handlers: + - name: Reload caddy + ansible.builtin.command: caddy reload --config /etc/caddy/Caddyfile + changed_when: true diff --git a/playbooks/ssh-keys.yml b/playbooks/ssh-keys.yml new file mode 100644 index 0000000..baf7d24 --- /dev/null +++ b/playbooks/ssh-keys.yml @@ -0,0 +1,20 @@ +--- +# Playbook: ssh-keys +# Purpose: Install your workstation SSH public key on all inventory hosts +# Targets: all hosts except localhost +# Usage: make copy-ssh-keys-ansible +# make copy-ssh-keys-ansible GROUP=services +# make copy-ssh-keys-ansible HOST=dev01 + +- name: Deploy workstation SSH public key + hosts: all:!local + gather_facts: false + vars: + ssh_public_key_file: "{{ lookup('env', 'SSH_PUBLIC_KEY') | default(lookup('env', 'HOME') + '/.ssh/id_ed25519.pub', true) }}" + tasks: + - name: Add SSH public key for ansible_user + ansible.posix.authorized_key: + user: "{{ ansible_user | default(ansible_user_id) }}" + state: present + key: "{{ 
lookup('file', ssh_public_key_file) }}" + become: false diff --git a/scripts/bootstrap-root-ssh-su-password.sh b/scripts/bootstrap-root-ssh-su-password.sh new file mode 100755 index 0000000..99ee3ac --- /dev/null +++ b/scripts/bootstrap-root-ssh-su-password.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# Bootstrap root SSH when `su` needs a password (no sudo on host). +# Usage: BOOTSTRAP_SU_PASSWORD='...' ./scripts/bootstrap-root-ssh-su-password.sh HOST +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +HOST="${1:-}" +BOOTSTRAP_USER="${BOOTSTRAP_USER:-ladmin}" +PUBKEY_FILE="${SSH_PUBLIC_KEY:-${HOME}/.ssh/id_ed25519.pub}" +SU_PASSWORD="${BOOTSTRAP_SU_PASSWORD:-}" + +[[ -n "${HOST}" ]] || { echo "Usage: $0 HOST" >&2; exit 1; } +[[ -n "${SU_PASSWORD}" ]] || { echo "Set BOOTSTRAP_SU_PASSWORD" >&2; exit 1; } +[[ -f "${PUBKEY_FILE}" ]] || { echo "Missing ${PUBKEY_FILE}" >&2; exit 1; } + +IP="$(awk -v h="${HOST}" '$1==h {for(i=2;i<=NF;i++) if($i~/^ansible_host=/) {sub(/ansible_host=/,"",$i); print $i; exit}}' \ + "${REPO_ROOT}/inventories/production/hosts")" +[[ -n "${IP}" ]] || { echo "No ansible_host for ${HOST}" >&2; exit 1; } + +PUBKEY="$(cat "${PUBKEY_FILE}")" +export IP BOOTSTRAP_USER SU_PASSWORD PUBKEY + +/usr/bin/expect <<'EXPECT' +set timeout 60 +spawn ssh -o StrictHostKeyChecking=accept-new $env(BOOTSTRAP_USER)@$env(IP) +expect { + -re {[$#] $} { } + timeout { exit 1 } +} +send "su -\r" +expect { + "Password:" { + send "$env(SU_PASSWORD)\r" + } + timeout { exit 1 } +} +expect { + -re {root@caddy|#|❯|[$#] $} { } + timeout { exit 1 } +} +send "bash --noprofile --norc\r" +expect { + -re {# $} { } + timeout { exit 1 } +} +send "mkdir -p /root/.ssh && chmod 700 /root/.ssh && touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys\r" +expect -re {# $} +send "grep -qF '$env(PUBKEY)' /root/.ssh/authorized_keys || echo '$env(PUBKEY)' >> /root/.ssh/authorized_keys\r" +expect -re {# $} +send "sed -i 's/^#*PermitRootLogin.*/PermitRootLogin 
prohibit-password/' /etc/ssh/sshd_config 2>/dev/null || echo PermitRootLogin prohibit-password >> /etc/ssh/sshd_config\r" +expect -re {# $} +send "systemctl restart ssh 2>/dev/null || systemctl restart sshd 2>/dev/null || true\r" +expect -re {# $} +send "exit\rexit\rexit\r" ;# leave the inner bash, the su shell and the ssh login shell so eof arrives before the 60 s timeout +expect eof +EXPECT + +ssh -o BatchMode=yes -i "${PUBKEY_FILE%.pub}" -o ConnectTimeout=10 \ + "root@${IP}" "echo OK: root@${IP}" # -i expects the private key, so the .pub suffix is stripped +echo "Done: root key on ${HOST}" diff --git a/scripts/bootstrap-root-ssh.sh b/scripts/bootstrap-root-ssh.sh new file mode 100755 index 0000000..69fbde7 --- /dev/null +++ b/scripts/bootstrap-root-ssh.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# Bootstrap root SSH key access via a normal user (default: ladmin). +# Usage: ./scripts/bootstrap-root-ssh.sh HOSTNAME +# BOOTSTRAP_USER=ladmin TARGET_USER=root SSH_PUBLIC_KEY=~/.ssh/id_ed25519.pub + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +INVENTORY_HOSTS="${INVENTORY_HOSTS:-${REPO_ROOT}/inventories/production/hosts}" +PUBKEY_FILE="${SSH_PUBLIC_KEY:-${HOME}/.ssh/id_ed25519.pub}" +BOOTSTRAP_USER="${BOOTSTRAP_USER:-ladmin}" +TARGET_USER="${TARGET_USER:-root}" +HOST="${1:-}" + +if [[ -z "${HOST}" ]]; then + echo "Usage: $0 HOST" >&2 + exit 1 +fi + +if [[ ! 
-f "${PUBKEY_FILE}" ]]; then + echo "Public key not found: ${PUBKEY_FILE}" >&2 + exit 1 +fi + +resolve_from_inventory() { # prints two lines for HOST's inventory entry: ansible_host, then ansible_user (either may be empty) + awk -v host="${HOST}" ' + $1 == host { + for (i = 2; i <= NF; i++) { + if ($i ~ /^ansible_host=/) { + sub(/ansible_host=/, "", $i) + ip = $i + } + if ($i ~ /^ansible_user=/) { + sub(/ansible_user=/, "", $i) + user = $i + } + } + } + END { + print ip + print user + } + ' "${INVENTORY_HOSTS}" +} + +IP="$(resolve_from_inventory | sed -n '1p')" +INV_USER="$(resolve_from_inventory | sed -n '2p')" + +if [[ -z "${IP}" ]]; then + echo "Could not resolve ansible_host for ${HOST} in ${INVENTORY_HOSTS}" >&2 + exit 1 +fi + +echo "==> ${HOST} (${BOOTSTRAP_USER}@${IP} -> ${TARGET_USER})" +echo " Inventory ansible_user: ${INV_USER:-}" +echo " Public key: ${PUBKEY_FILE}" +echo "" + +echo "Step 1/3: Install key for ${BOOTSTRAP_USER} (password: ${BOOTSTRAP_USER})" +ssh-copy-id -i "${PUBKEY_FILE}" -o StrictHostKeyChecking=accept-new \ + "${BOOTSTRAP_USER}@${IP}" + +echo "" +echo "Step 2/3: Copy key and configure ${TARGET_USER} via su (password: root)" +REMOTE_KEY="/tmp/ansible-bootstrap.pub" +scp -o StrictHostKeyChecking=accept-new "${PUBKEY_FILE}" \ + "${BOOTSTRAP_USER}@${IP}:${REMOTE_KEY}" + +ssh -t "${BOOTSTRAP_USER}@${IP}" bash -s </dev/null; then + cat "\${REMOTE_KEY}" >> /root/.ssh/authorized_keys +fi +rm -f "\${REMOTE_KEY}" +if [ -f /etc/ssh/sshd_config ]; then + if grep -q '^PermitRootLogin' /etc/ssh/sshd_config; then + sed -i 's/^#*PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config + else + echo 'PermitRootLogin prohibit-password' >> /etc/ssh/sshd_config + fi + systemctl restart ssh 2>/dev/null \ + || systemctl restart sshd 2>/dev/null \ + || service ssh restart 2>/dev/null \ + || true +fi +echo "OK: root authorized_keys updated; PermitRootLogin prohibit-password" +ROOT_SCRIPT +REMOTE_SCRIPT + +echo "" +echo "Step 3/3: Verify ${TARGET_USER} key login" +ssh -o BatchMode=yes -i "${PUBKEY_FILE%.pub}" -o StrictHostKeyChecking=accept-new \ 
+ "${TARGET_USER}@${IP}" "echo OK: ${TARGET_USER}@${IP} accepts your SSH key" + +echo "" +echo "Done: ${HOST} — use: ssh -i ${PUBKEY_FILE%.pub} ${TARGET_USER}@${IP}" diff --git a/scripts/kuma-setup-smtp.sh b/scripts/kuma-setup-smtp.sh new file mode 100755 index 0000000..5eb0589 --- /dev/null +++ b/scripts/kuma-setup-smtp.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# Configure Uptime Kuma SMTP notification (Mailcow) via Socket.IO API. +# Run from machine with network access to Kuma: +# export KUMA_URL=http://10.0.10.22:3001 +# export KUMA_USER=admin +# export KUMA_PASSWORD='your-kuma-password' +# export SMTP_USER=alerts@levkine.ca +# export SMTP_PASS='mailbox-password' +# export SMTP_TO=idobkin@gmail.com +# pip install uptime-kuma-api +# ./scripts/kuma-setup-smtp.sh + +set -euo pipefail + +export KUMA_URL="${KUMA_URL:-http://10.0.10.22:3001}" # exported (here and below): the Python heredoc reads every setting from os.environ, so unexported defaults raise KeyError +export KUMA_USER="${KUMA_USER:-admin}" +export KUMA_PASSWORD="${KUMA_PASSWORD:-}" +export SMTP_HOST="${SMTP_HOST:-mail.levkine.ca}" +export SMTP_PORT="${SMTP_PORT:-587}" +export SMTP_USER="${SMTP_USER:-alerts@levkine.ca}" +export SMTP_PASS="${SMTP_PASS:-}" +export SMTP_TO="${SMTP_TO:-idobkin@gmail.com}" + +if [[ -z "${KUMA_PASSWORD}" || -z "${SMTP_PASS}" ]]; then + echo "Set KUMA_PASSWORD and SMTP_PASS" >&2 + exit 1 +fi + +python3 <<'PY' +import os +import sys + +try: + from uptime_kuma_api import UptimeKumaApi +except ImportError: + print("pip install uptime-kuma-api", file=sys.stderr) + sys.exit(1) + +url = os.environ["KUMA_URL"] +user = os.environ["KUMA_USER"] +password = os.environ["KUMA_PASSWORD"] +smtp_host = os.environ["SMTP_HOST"] +smtp_port = int(os.environ["SMTP_PORT"]) +smtp_user = os.environ["SMTP_USER"] +smtp_pass = os.environ["SMTP_PASS"] +smtp_to = os.environ["SMTP_TO"] + +with UptimeKumaApi(url) as api: + api.login(user, password) + # Notification type name in Kuma 1.x is often 'smtp' / 'email' + result = api.add_notification( + name="Mailcow alerts", + type="smtp", + isDefault=True, + applyExisting=True, + smtpHost=smtp_host, + smtpPort=smtp_port, + smtpSecure=smtp_port == 465, # implicit TLS only on 465; 587 (the default here) upgrades via STARTTLS
smtpIgnoreTLS=False, + smtpUsername=smtp_user, + smtpPassword=smtp_pass, + smtpFrom=smtp_user, + smtpTo=smtp_to, + ) + print(result) +PY diff --git a/scripts/load-mailcow-vault-env.sh b/scripts/load-mailcow-vault-env.sh new file mode 100755 index 0000000..87b2061 --- /dev/null +++ b/scripts/load-mailcow-vault-env.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Export Mailcow API + mailbox password from .env or Ansible vault. +# Usage: source scripts/load-mailcow-vault-env.sh [mailbox_local_part] +set -euo pipefail # NOTE(review): the usage above sources this file, so these options stay active in the calling shell — confirm that is intended + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +VAULT_FILE="${REPO_ROOT}/inventories/production/group_vars/all/vault.yml" +VAULT_PASS="${HOME}/.ansible-vault-pass" +ANSIBLE_VAULT="${REPO_ROOT}/.venv/bin/ansible-vault" +MAILBOX_KEY="${1:-${MAILBOX:-${MAILBOX_LOCAL_PART:-}}}" + +set -a # auto-export every variable that .env assigns +[ -f "${REPO_ROOT}/.env" ] && . "${REPO_ROOT}/.env" +set +a + +if [[ -n "${MAILCOW_API_KEY:-}" && -n "${MAILBOX_PASSWORD:-${ALERTS_PASSWORD:-}}" ]]; then + export MAILBOX_PASSWORD="${MAILBOX_PASSWORD:-${ALERTS_PASSWORD:-}}" # NOTE(review): ALERTS_PASSWORD is accepted here whatever MAILBOX_KEY is — verify for non-alerts mailboxes + return 0 2>/dev/null || exit 0 # return when sourced; exit is the fallback when executed directly +fi + +if [[ ! -f "${VAULT_FILE}" ]] || [[ ! 
-f "${VAULT_PASS}" ]]; then + return 0 2>/dev/null || exit 0 +fi + +eval "$("${REPO_ROOT}/.venv/bin/python3" - "${VAULT_FILE}" "${VAULT_PASS}" "${ANSIBLE_VAULT}" "${MAILBOX_KEY}" <<'PY' +import os, subprocess, sys, yaml, shlex + +vault_file, vault_pass, ansible_vault, mailbox_key = sys.argv[1:5] +text = subprocess.check_output( + [ansible_vault, "view", vault_file, "--vault-password-file", vault_pass], + text=True, +) +data = yaml.safe_load(text) or {} +out = [] # "export NAME=value" lines; the surrounding eval applies them to this shell +api = data.get("vault_mailcow_api_key") or "" +if api: + out.append("export MAILCOW_API_KEY=" + shlex.quote(str(api))) +passwords = data.get("vault_mailcow_mailbox_passwords") or {} +pw = "" +if mailbox_key and mailbox_key in passwords: + pw = passwords[mailbox_key] +elif mailbox_key == "alerts": # the alerts password may also be stored under vault_alerts_mailbox_password + pw = data.get("vault_alerts_mailbox_password") or passwords.get("alerts", "") +if pw: + out.append("export MAILBOX_PASSWORD=" + shlex.quote(str(pw))) + out.append("export ALERTS_PASSWORD=" + shlex.quote(str(pw))) +print("\n".join(out)) +PY +)" + +return 0 2>/dev/null || exit 0 diff --git a/scripts/load-vault-lxc-root-password.sh b/scripts/load-vault-lxc-root-password.sh new file mode 100755 index 0000000..f319ab8 --- /dev/null +++ b/scripts/load-vault-lxc-root-password.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Export BOOTSTRAP_SU_PASSWORD from vault_lxc_root_password +set -euo pipefail +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +eval "$("${REPO_ROOT}/.venv/bin/python3" - "${REPO_ROOT}" <<'PY' +import os, subprocess, sys, yaml, shlex +repo = sys.argv[1] +text = subprocess.check_output( + [os.path.join(repo, ".venv/bin/ansible-vault"), "view", + os.path.join(repo, "inventories/production/group_vars/all/vault.yml"), + "--vault-password-file", os.path.expanduser("~/.ansible-vault-pass")], + text=True, +) +pw = (yaml.safe_load(text) or {}).get("vault_lxc_root_password", "") +if pw: + print("export BOOTSTRAP_SU_PASSWORD=" + shlex.quote(str(pw))) +PY +)" diff --git a/scripts/mailcow-mailbox-from-inventory.sh b/scripts/mailcow-mailbox-from-inventory.sh new file mode 100755 index 0000000..6585ce2 --- /dev/null +++ b/scripts/mailcow-mailbox-from-inventory.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Resolve MAILBOX=<key> against mailcow_mailboxes in inventories/production/group_vars/all/mailcow.yml +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +MAILBOX="${MAILBOX:-}" +[[ -n "${MAILBOX}" ]] || { echo "MAILBOX required" >&2; exit 1; } + +# Print the export lines on stdout: this script runs as a child process, so the caller (run-mailcow-mailbox.sh) must eval them itself. +"${REPO_ROOT}/.venv/bin/python3" - "${REPO_ROOT}" "${MAILBOX}" <<'PY' +import sys, yaml, shlex, os + +repo, key = sys.argv[1], sys.argv[2] +path = os.path.join(repo, "inventories/production/group_vars/all/mailcow.yml") +with open(path) as f: + data = yaml.safe_load(f) or {} +boxes = data.get("mailcow_mailboxes") or {} +if key not in boxes: + raise SystemExit(f"Unknown MAILBOX={key!r}. 
Add it to mailcow_mailboxes in mailcow.yml") +b = boxes[key] +out = [] +for k, env in [ + ("local_part", "MAILBOX_LOCAL_PART"), + ("name", "MAILBOX_NAME"), + ("quota", "MAILBOX_QUOTA"), +]: + if k in b and b[k] is not None: + out.append(f"export {env}={shlex.quote(str(b[k]))}") +if b.get("vault_password_key"): + out.append(f"export MAILBOX_VAULT_KEY={shlex.quote(str(b['vault_password_key']))}") +print("\n".join(out)) +PY diff --git a/scripts/mailcow-mailbox.sh b/scripts/mailcow-mailbox.sh new file mode 100755 index 0000000..3bba69f --- /dev/null +++ b/scripts/mailcow-mailbox.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Create or update a Mailcow mailbox via API. +# +# Usage: +# make mailcow-mailbox MAILBOX=alerts +# # or with env (after: source scripts/load-mailcow-vault-env.sh): +# MAILBOX_LOCAL_PART=notify MAILBOX_NAME="Notify" MAILBOX_PASSWORD='...' ./scripts/mailcow-mailbox.sh +# +# Variables (env or make): +# MAILBOX / MAILBOX_LOCAL_PART — local part (required) +# MAILBOX_NAME — display name (default: title-case of local part) +# MAILBOX_PASSWORD — if unset, loaded from vault_mailcow_mailbox_passwords[local_part] +# MAILBOX_QUOTA — MiB (default 1024) +# MAILCOW_URL, MAILCOW_DOMAIN, MAILCOW_API_KEY — see load-mailcow-vault-env.sh + +set -euo pipefail + +MAILCOW_URL="${MAILCOW_URL:-https://mail.levkine.ca}" +DOMAIN="${MAILCOW_DOMAIN:-levkine.ca}" +LOCAL_PART="${MAILBOX_LOCAL_PART:-${MAILBOX:-}}" +API_KEY="${MAILCOW_API_KEY:-}" +MAILBOX_PASSWORD="${MAILBOX_PASSWORD:-${ALERTS_PASSWORD:-}}" +QUOTA="${MAILBOX_QUOTA:-1024}" + +if [[ -z "${LOCAL_PART}" ]]; then + echo "Set MAILBOX=localpart or MAILBOX_LOCAL_PART" >&2 + exit 1 +fi + +if [[ -z "${API_KEY}" ]]; then + echo "Set MAILCOW_API_KEY (make mailcow-mailbox loads vault/.env)" >&2 + exit 1 +fi + +if [[ -z "${MAILBOX_PASSWORD}" ]]; then + echo "Set MAILBOX_PASSWORD or add vault_mailcow_mailbox_passwords.${LOCAL_PART} in vault" >&2 + exit 1 +fi + +DISPLAY_NAME="${MAILBOX_NAME:-$(echo "${LOCAL_PART}" | sed 
's/[-_]/ /g' | awk '{for(i=1;i<=NF;i++) $i=toupper(substr($i,1,1)) tolower(substr($i,2)); print}')}" + +ATTR=$(jq -nc \ + --arg lp "${LOCAL_PART}" \ + --arg dom "${DOMAIN}" \ + --arg name "${DISPLAY_NAME}" \ + --arg pw "${MAILBOX_PASSWORD}" \ + --arg quota "${QUOTA}" \ + '{local_part:$lp,domain:$dom,name:$name,quota:$quota,password:$pw,password2:$pw,active:"1"}') + +echo "Creating mailbox ${LOCAL_PART}@${DOMAIN} (${DISPLAY_NAME})..." +RESP=$(curl -sk -w "\n%{http_code}" -X POST "${MAILCOW_URL}/api/v1/add/mailbox" \ + -H "X-API-Key: ${API_KEY}" \ + --data-urlencode "attr=${ATTR}") # url-encode the JSON so &, + or % (e.g. inside the password) survive form decoding +HTTP_CODE=$(echo "${RESP}" | tail -1) +BODY=$(echo "${RESP}" | sed '$d') +echo "${BODY}" | jq . 2>/dev/null || echo "${BODY}" +if [[ "${HTTP_CODE}" -lt 200 || "${HTTP_CODE}" -ge 300 ]]; then + echo "Mailcow API HTTP ${HTTP_CODE}" >&2 + exit 1 +fi + +echo "Done: ${LOCAL_PART}@${DOMAIN}" diff --git a/scripts/run-mailcow-mailbox.sh b/scripts/run-mailcow-mailbox.sh new file mode 100755 index 0000000..7d2ed09 --- /dev/null +++ b/scripts/run-mailcow-mailbox.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Wrapper for: make mailcow-mailbox MAILBOX=name +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +MAILBOX="${MAILBOX:?MAILBOX required}" + +cd "${REPO_ROOT}" +eval "$(./scripts/mailcow-mailbox-from-inventory.sh)" +. ./scripts/load-mailcow-vault-env.sh "${MAILBOX_VAULT_KEY:-${MAILBOX}}" + +if [[ -z "${MAILCOW_API_KEY:-}" || -z "${MAILBOX_PASSWORD:-}" ]]; then + echo "Missing vault_mailcow_api_key or vault_mailcow_mailbox_passwords.${MAILBOX}" >&2 + exit 1 +fi + +exec ./scripts/mailcow-mailbox.sh diff --git a/scripts/security-audit-lxc-via-pve.sh b/scripts/security-audit-lxc-via-pve.sh new file mode 100755 index 0000000..0798953 --- /dev/null +++ b/scripts/security-audit-lxc-via-pve.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Audit LXCs on a Proxmox node via pct exec (run ON the PVE host as root). 
+set -u + +AUDIT='#!/bin/bash +echo "=== identity ===" +hostname -f 2>/dev/null || hostname +[ -f /etc/os-release ] && . /etc/os-release && echo "os=${PRETTY_NAME:-unknown}" +echo "ip=$(hostname -I 2>/dev/null | awk "{print \$1}")" +echo "=== sshd (effective) ===" +if command -v sshd >/dev/null 2>&1; then + sshd -T 2>/dev/null | grep -E "^(permitrootlogin|passwordauthentication|pubkeyauthentication|permitemptypasswords|port) " || true +else + grep -E "^(PermitRootLogin|PasswordAuthentication|PubkeyAuthentication|Port) " /etc/ssh/sshd_config 2>/dev/null | grep -v "^#" || echo "sshd not installed" +fi +echo "=== firewall ===" +ufw status 2>/dev/null | head -3 || echo "no ufw" +echo "=== fail2ban ===" +systemctl is-active fail2ban 2>/dev/null || echo "inactive/missing" +echo "=== pending upgrades ===" +apt-get -s upgrade 2>/dev/null | grep -c "^Inst" || echo 0 +echo "=== public listeners ===" +ss -tlnp 2>/dev/null | grep LISTEN | grep -v "127.0.0.1:" | grep -v "\[::1\]:" | head -12 +' + +echo "PVE_NODE=$(hostname -f 2>/dev/null || hostname)" +echo "PVE_IP=$(hostname -I | awk '{print $1}')" + +for id in $(pct list 2>/dev/null | awk 'NR>1 {print $1}'); do + name=$(pct list | awk -v id="$id" '$1==id {print $4}') + status=$(pct list | awk -v id="$id" '$1==id {print $2}') + echo "" + echo "######## LXC vmid=$id name=$name status=$status ########" + if [ "$status" != "running" ]; then + echo "SKIP: not running" + continue + fi + pct exec "$id" -- bash -c "$AUDIT" 2>&1 || echo "ERROR: pct exec failed" +done diff --git a/scripts/security-audit-remote.sh b/scripts/security-audit-remote.sh new file mode 100755 index 0000000..516a354 --- /dev/null +++ b/scripts/security-audit-remote.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Quick read-only security snapshot (run on target host). +set -euo pipefail + +echo "=== identity ===" +hostname -f 2>/dev/null || hostname +if [ -f /etc/os-release ]; then . 
/etc/os-release; echo "os=${PRETTY_NAME:-unknown}"; fi +echo "kernel=$(uname -r)" +echo "uptime=$(uptime -p 2>/dev/null || uptime)" + +echo "=== sshd (effective) ===" +if command -v sshd >/dev/null 2>&1; then + sshd -T 2>/dev/null | grep -E '^(permitrootlogin|passwordauthentication|pubkeyauthentication|permitemptypasswords|port|x11forwarding|allowtcpforwarding) ' || true +else + grep -E '^(PermitRootLogin|PasswordAuthentication|PubkeyAuthentication|Port) ' /etc/ssh/sshd_config 2>/dev/null | grep -v '^#' || echo "sshd not found" +fi + +echo "=== firewall ===" +if command -v ufw >/dev/null 2>&1; then + ufw status verbose 2>/dev/null | head -8 || true # a failing stage (e.g. ufw run without root) must not abort the snapshot under errexit+pipefail +elif command -v firewall-cmd >/dev/null 2>&1; then + firewall-cmd --state 2>/dev/null || true +else + echo "no ufw/firewalld" +fi + +echo "=== fail2ban ===" +systemctl is-active fail2ban 2>/dev/null || echo "fail2ban: inactive or missing" + +echo "=== unattended-upgrades ===" +systemctl is-active unattended-upgrades 2>/dev/null || echo "unattended-upgrades: inactive or missing" + +echo "=== pending apt upgrades ===" +if command -v apt >/dev/null 2>&1; then + apt-get -s upgrade 2>/dev/null | grep -c '^Inst' || true # grep -c already prints 0 (and exits 1) when nothing matches +else + echo "n/a" +fi + +echo "=== listening tcp (public) ===" +ss -tlnp 2>/dev/null | awk 'NR==1 || /LISTEN/ {print}' | grep -v '127.0.0.1:' | grep -v '\[::1\]:' | head -20 || true # keep going when ss is missing or nothing is listed + +echo "=== uid 0 accounts ===" +awk -F: '$3==0 {print $1}' /etc/passwd | tr '\n' ' ' +echo + +echo "=== last logins (top 5) ===" +last -n 5 2>/dev/null | head -5 || true diff --git a/scripts/security-audit-ssh.sh b/scripts/security-audit-ssh.sh new file mode 100755 index 0000000..1bac6e5 --- /dev/null +++ b/scripts/security-audit-ssh.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# SSH-focused audit (hypervisor or guest). 
+set -u + +echo "=== host ===" +hostname -f 2>/dev/null || hostname + +echo "=== sshd effective config ===" +if command -v sshd >/dev/null 2>&1; then + sshd -T 2>/dev/null | grep -E '^(port|permitrootlogin|passwordauthentication|pubkeyauthentication|permitemptypasswords|maxauthtries|x11forwarding|allowtcpforwarding|gatewayports|permittunnel|usepam|kbdinteractiveauthentication) ' || true +else + echo "sshd binary missing" +fi + +echo "=== sshd_config (non-comment) ===" +grep -E '^(Port|PermitRootLogin|PasswordAuthentication|PubkeyAuthentication|PermitEmptyPasswords|MaxAuthTries|AllowUsers|AllowGroups|X11Forwarding) ' /etc/ssh/sshd_config 2>/dev/null || true + +echo "=== authorized_keys (root) ===" +if [ -f /root/.ssh/authorized_keys ]; then + wc -l /root/.ssh/authorized_keys + awk '{print $NF}' /root/.ssh/authorized_keys 2>/dev/null | sed 's/^/ key: /' +else + echo "no /root/.ssh/authorized_keys" +fi + +echo "=== recent ssh auth failures (today) ===" +{ journalctl -u ssh -u sshd --since today 2>/dev/null || cat /var/log/auth.log 2>/dev/null; } | grep -iE 'Failed|Invalid|refused' | tail -5 | grep . || echo "no logs" # fallbacks must not hang off tail's status (always 0); the final grep . fails on empty output diff --git a/scripts/vault-export-env.sh b/scripts/vault-export-env.sh new file mode 100755 index 0000000..05f773c --- /dev/null +++ b/scripts/vault-export-env.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +# Write Ansible vault secrets into .env (for local scripts / reference). +# Does not print secret values. Does not overwrite non-empty .env keys. +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +ENV_FILE="${1:-${REPO_ROOT}/.env}" +VAULT_FILE="${REPO_ROOT}/inventories/production/group_vars/all/vault.yml" +VAULT_PASS="${HOME}/.ansible-vault-pass" +ANSIBLE_VAULT="${REPO_ROOT}/.venv/bin/ansible-vault" + +[[ -f "${VAULT_PASS}" ]] || { echo "Missing ${VAULT_PASS}" >&2; exit 1; } + +"${REPO_ROOT}/.venv/bin/python3" - "${ENV_FILE}" "${VAULT_FILE}" "${VAULT_PASS}" "${ANSIBLE_VAULT}" <<'PY' +import subprocess, sys, yaml +from pathlib import Path + +env_file, vault_file, vault_pass, ansible_vault = sys.argv[1:5] + +# vault key -> .env key +MAP = { + "vault_mailcow_api_key": "MAILCOW_API_KEY", + "vault_alerts_mailbox_password": "ALERTS_PASSWORD", + "vault_uptime_kuma_password": "KUMA_PASSWORD", + "vault_uptime_kuma_user": "KUMA_USER", + "vault_uptime_kuma_url": "KUMA_URL", + "vault_umami_admin_password": "UMAMI_ADMIN_PASSWORD", + "vault_umami_db_password": "UMAMI_DB_PASS", + "vault_umami_app_secret": "UMAMI_APP_SECRET", + "vault_kuma_smtp_host": "SMTP_HOST", + "vault_kuma_smtp_port": "SMTP_PORT", + "vault_kuma_smtp_user": "SMTP_USER", + "vault_kuma_smtp_password": "SMTP_PASS", + "vault_kuma_smtp_to": "SMTP_TO", + "vault_mattermost_url": "MATTERMOST_URL", + "vault_mattermost_token": "MATTERMOST_TOKEN", + "vault_mattermost_allowed_users": "MATTERMOST_ALLOWED_USERS", +} + +def parse_env(text): # KEY=VALUE lines -> dict; blank lines, "#" comments and lines without "=" are skipped, surrounding quote characters are stripped + d = {} + for line in text.splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, _, v = line.partition("=") + d[k.strip()] = v.strip().strip("'").strip('"') + return d + +text = subprocess.check_output( + [ansible_vault, "view", vault_file, "--vault-password-file", vault_pass], + text=True, +) +data = yaml.safe_load(text) or {} +existing = parse_env(Path(env_file).read_text()) if Path(env_file).exists() else {} +merged = dict(existing) + +for vk, ek in MAP.items(): + val = data.get(vk) + if val is None or val == "": + continue + if merged.get(ek): # a non-empty value already present in .env wins over the vault + continue + merged[ek] = str(val) + +pw = 
data.get("vault_mailcow_mailbox_passwords") or {} +if pw.get("alerts") and not merged.get("ALERTS_PASSWORD"): + merged["ALERTS_PASSWORD"] = str(pw["alerts"]) + +header = """# Merged from Ansible vault (make vault-export-env). Fill gaps manually. +# vault → .env: make vault-export-env +# .env → vault: make vault-import-env +# hosts → .env → vault: make vault-pull-infra-secrets + +""" +body = "\n".join(f"{k}={v}" for k, v in sorted(merged.items())) + "\n" # NOTE(review): values are written unquoted, yet load-mailcow-vault-env.sh sources .env as shell — confirm secrets never contain spaces or shell metacharacters +Path(env_file).write_text(header + body) +print(f"Wrote {len(merged)} keys to {env_file} (existing non-empty keys kept)") +PY + +chmod 600 "${ENV_FILE}" 2>/dev/null || true # best effort: restrict the secrets file to its owner after it is written diff --git a/scripts/vault-import-env.sh b/scripts/vault-import-env.sh new file mode 100755 index 0000000..59e0b7e --- /dev/null +++ b/scripts/vault-import-env.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# Merge .env into inventories/production/group_vars/all/vault.yml +# Usage: make vault-import-env [ENV_FILE=.env] +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +ENV_FILE="${1:-${ENV_FILE:-${REPO_ROOT}/.env}}" +VAULT_FILE="${REPO_ROOT}/inventories/production/group_vars/all/vault.yml" +VAULT_PASS="${HOME}/.ansible-vault-pass" +ANSIBLE_VAULT="${REPO_ROOT}/.venv/bin/ansible-vault" + +[[ -f "${ENV_FILE}" ]] || { echo "No env file: ${ENV_FILE}" >&2; exit 1; } +[[ -f "${VAULT_PASS}" ]] || { echo "Missing ${VAULT_PASS}" >&2; exit 1; } + +"${REPO_ROOT}/.venv/bin/python3" - "${ENV_FILE}" "${VAULT_FILE}" "${VAULT_PASS}" "${ANSIBLE_VAULT}" <<'PY' +import os, re, subprocess, sys, tempfile, yaml + +env_file, vault_file, vault_pass, ansible_vault = sys.argv[1:5] + +def load_env(path): # KEY=VALUE parser: skips blanks and "#" comments, tolerates a leading "export ", drops keys whose value is empty + out = {} + with open(path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + if line.startswith("export "): + line = line[7:].strip() + if "=" not in line: + continue + k, _, v = line.partition("=") + v = v.strip().strip("'").strip('"') + if v: + out[k.strip()] = v + return out + +# .env key -> vault key; a (vault_key, name) tuple also mirrors the value into vault_mailcow_mailbox_passwords[name] 
+MAP = { + "MAILCOW_API_KEY": "vault_mailcow_api_key", + "ALERTS_PASSWORD": ("vault_alerts_mailbox_password", "alerts"), + "KUMA_PASSWORD": "vault_uptime_kuma_password", + "KUMA_USER": "vault_uptime_kuma_user", + "KUMA_URL": "vault_uptime_kuma_url", + "UMAMI_ADMIN_PASSWORD": "vault_umami_admin_password", + "UMAMI_DB_PASS": "vault_umami_db_password", + "UMAMI_APP_SECRET": "vault_umami_app_secret", + "SMTP_HOST": "vault_kuma_smtp_host", + "SMTP_PORT": "vault_kuma_smtp_port", + "SMTP_USER": "vault_kuma_smtp_user", + "SMTP_PASS": "vault_kuma_smtp_password", + "SMTP_TO": "vault_kuma_smtp_to", + "MATTERMOST_URL": "vault_mattermost_url", + "MATTERMOST_TOKEN": "vault_mattermost_token", + "MATTERMOST_ALLOWED_USERS": "vault_mattermost_allowed_users", + "PROXMOX_PASSWORD": "vault_proxmox_password", + "LXC_ROOT_PASSWORD": "vault_lxc_root_password", +} + +env = load_env(env_file) +text = subprocess.check_output( + [ansible_vault, "view", vault_file, "--vault-password-file", vault_pass], + text=True, +) +data = yaml.safe_load(text) or {} +passwords = dict(data.get("vault_mailcow_mailbox_passwords") or {}) + +for k, v in env.items(): + m = re.match(r"^MAILBOX_(.+)_PASSWORD$", k, re.I) + if m: + passwords[m.group(1).lower()] = v + continue + target = MAP.get(k) + if not target: + continue + if isinstance(target, tuple): + data[target[0]] = v + passwords[target[1]] = v + else: + data[target] = v + +if passwords: + data["vault_mailcow_mailbox_passwords"] = passwords + +fd, tmp = tempfile.mkstemp(suffix=".yml") +try: +    with os.fdopen(fd, "w") as f: +        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) +    subprocess.run( +        [ansible_vault, "encrypt", tmp, "--output", vault_file, +         "--vault-password-file", vault_pass, "--encrypt-vault-id", "default"], +        check=True, +    ) +finally: # the temp file holds every secret in plaintext: remove it even when dumping or encrypting fails +    os.remove(tmp) +print(f"Updated {vault_file} from {env_file} ({len(env)} values)") +PY diff --git a/scripts/vault-pull-infra-secrets.sh b/scripts/vault-pull-infra-secrets.sh new file mode 
100755 index 0000000..b1ad980 --- /dev/null +++ b/scripts/vault-pull-infra-secrets.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Pull secrets from live hosts into .env, then vault-import-env. +# Does not print secret values. +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +ENV_FILE="${REPO_ROOT}/.env" + +python3 - "${ENV_FILE}" <<'PY' +import subprocess, sys +from pathlib import Path + +out = Path(sys.argv[1]) +lines = [] + +def sh(cmd): # run a shell command line and return its stripped stdout; check_output raises on a non-zero exit + return subprocess.check_output(cmd, shell=True, text=True).strip() + +def parse_env(text): # KEY=VALUE lines -> dict; blank lines, "#" comments and lines without "=" are skipped + d = {} + for line in text.splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, _, v = line.partition("=") + d[k.strip()] = v.strip().strip("'").strip('"') + return d + +# monitoring LXC +try: + raw = sh("ssh -o BatchMode=yes -o ConnectTimeout=8 root@10.0.10.22 'cat /opt/monitoring/.env 2>/dev/null'") + m = parse_env(raw) + if m.get("UMAMI_DB_PASS"): + lines.append(f"UMAMI_DB_PASS={m['UMAMI_DB_PASS']}") + if m.get("UMAMI_APP_SECRET"): + lines.append(f"UMAMI_APP_SECRET={m['UMAMI_APP_SECRET']}") +except Exception as e: + print(f"# skip monitoring: {e}", file=sys.stderr) + +# hermes mattermost +try: + raw = sh("ssh -o BatchMode=yes -o ConnectTimeout=8 ladmin@10.0.10.36 \"sudo cat /home/hermes/.hermes/secrets/mattermost.env 2>/dev/null\"") + h = parse_env(raw) + for k in ("MATTERMOST_URL", "MATTERMOST_TOKEN", "MATTERMOST_ALLOWED_USERS"): + if h.get(k): + lines.append(f"{k}={h[k]}") +except Exception as e: + print(f"# skip hermes: {e}", file=sys.stderr) + +# merge with existing .env: keys pulled above overwrite same-named entries, every other existing key is kept +existing = {} +if out.exists(): + existing = parse_env(out.read_text()) + +merged = {**existing} +for line in lines: + k, _, v = line.partition("=") + merged[k] = v + +header = """# Auto-merged by scripts/vault-pull-infra-secrets.sh + your edits +# Run: make vault-import-env + +""" +body = "\n".join(f"{k}={v}" for k, v in sorted(merged.items())) + 
"\n" +out.write_text(header + body) +print(f"Wrote {len(merged)} keys to {out}") +PY + +chmod 600 "${ENV_FILE}" 2>/dev/null || true # best effort: restrict the secrets file to its owner +"${REPO_ROOT}/scripts/vault-import-env.sh" "${ENV_FILE}" # then merge the refreshed .env into the vault