Infrastructure as Code
Production-ready automation templates for homelab infrastructure. Ansible playbooks, Terraform modules, and shell scripts used to manage the ArgoBox environment.
Docker Stack Deployment
Deploy and configure Docker with common containers across multiple hosts
---
# Ansible Playbook: Deploy Docker Stack
# Usage: ansible-playbook -i inventory docker-stack.yml
# Requires the community.docker collection:
#   ansible-galaxy collection install community.docker
- name: Deploy Docker and common containers
  hosts: docker_hosts
  become: true
  vars:
    # NOTE(review): not referenced in this play -- presumably informational
    # or consumed elsewhere; confirm before removing.
    docker_compose_version: "2.24.0"
    # Keyring used by the signed-by repo entry below (apt-key is deprecated
    # and removed on current Debian/Ubuntu, so the old apt_key module flow
    # is replaced with a plain keyring download).
    docker_apt_keyring: /etc/apt/keyrings/docker.asc
    containers:
      - name: portainer
        image: portainer/portainer-ce:latest
        ports: ["9443:9443"]
        volumes:
          - /var/run/docker.sock:/var/run/docker.sock
          - portainer_data:/data
      - name: watchtower
        image: containrrr/watchtower:latest
        volumes:
          - /var/run/docker.sock:/var/run/docker.sock
        environment:
          WATCHTOWER_CLEANUP: "true"
          WATCHTOWER_SCHEDULE: "0 0 4 * * *"
  tasks:
    - name: Install Docker dependencies
      ansible.builtin.apt:
        name:
          - apt-transport-https
          - ca-certificates
          - curl
          - gnupg
          - lsb-release
        state: present
        update_cache: true

    - name: Ensure apt keyring directory exists
      ansible.builtin.file:
        path: /etc/apt/keyrings
        state: directory
        mode: "0755"

    # Replaces the deprecated apt_key module: download the ASCII-armored
    # key and reference it via signed-by in the repository definition.
    - name: Add Docker GPG key
      ansible.builtin.get_url:
        url: https://download.docker.com/linux/ubuntu/gpg
        dest: "{{ docker_apt_keyring }}"
        mode: "0644"

    - name: Add Docker repository
      ansible.builtin.apt_repository:
        repo: "deb [signed-by={{ docker_apt_keyring }}] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
        state: present

    - name: Install Docker
      ansible.builtin.apt:
        name:
          - docker-ce
          - docker-ce-cli
          - containerd.io
          - docker-buildx-plugin
          - docker-compose-plugin
        state: present
        update_cache: true

    - name: Start and enable Docker
      ansible.builtin.systemd:
        name: docker
        state: started
        enabled: true

    # Each entry in `containers` may omit ports/volumes/environment;
    # default(omit) drops the parameter entirely in that case.
    - name: Deploy containers
      community.docker.docker_container:
        name: "{{ item.name }}"
        image: "{{ item.image }}"
        ports: "{{ item.ports | default(omit) }}"
        volumes: "{{ item.volumes | default(omit) }}"
        env: "{{ item.environment | default(omit) }}"
        restart_policy: unless-stopped
      loop: "{{ containers }}"
Tailscale Mesh VPN Setup
Install and configure Tailscale across all nodes with subnet routing
---
# Ansible Playbook: Tailscale Mesh VPN
# Usage: ansible-playbook -i inventory tailscale-setup.yml -e "tailscale_authkey=tskey-xxx"
- name: Install and configure Tailscale
  hosts: all
  become: true
  vars:
    # Auth key comes from the environment unless overridden with -e.
    tailscale_authkey: "{{ lookup('env', 'TAILSCALE_AUTHKEY') }}"
    tailscale_keyring: /usr/share/keyrings/tailscale-archive-keyring.gpg
    # Hosts that advertise a LAN subnet into the tailnet.
    subnet_routers:
      - host: alpha-centauri
        advertise_routes: "10.42.0.0/24"
      - host: titawin-host
        advertise_routes: "192.168.20.0/24"
  tasks:
    # The release name was hardcoded to "jammy", which breaks on any other
    # Ubuntu release; derive it from facts instead. apt_key is deprecated,
    # so the key goes to a keyring referenced via signed-by.
    - name: Add Tailscale repository key
      ansible.builtin.get_url:
        url: "https://pkgs.tailscale.com/stable/ubuntu/{{ ansible_distribution_release }}.noarmor.gpg"
        dest: "{{ tailscale_keyring }}"
        mode: "0644"

    - name: Add Tailscale repository
      ansible.builtin.apt_repository:
        repo: "deb [signed-by={{ tailscale_keyring }}] https://pkgs.tailscale.com/stable/ubuntu {{ ansible_distribution_release }} main"
        state: present

    - name: Install Tailscale
      ansible.builtin.apt:
        name: tailscale
        state: present
        update_cache: true

    - name: Enable IP forwarding (for subnet routers)
      ansible.posix.sysctl:
        name: "{{ item }}"
        value: "1"
        sysctl_set: true
        reload: true
      loop:
        - net.ipv4.ip_forward
        - net.ipv6.conf.all.forwarding
      when: inventory_hostname in (subnet_routers | map(attribute='host'))

    - name: Start Tailscale service
      ansible.builtin.systemd:
        name: tailscaled
        state: started
        enabled: true

    # no_log keeps the auth key out of playbook output and log files.
    - name: Authenticate with Tailscale
      ansible.builtin.command: >
        tailscale up
        --authkey={{ tailscale_authkey }}
        --ssh
        --accept-routes
        {% if inventory_hostname in (subnet_routers | map(attribute='host')) %}
        --advertise-routes={{ (subnet_routers | selectattr('host', 'eq', inventory_hostname) | first).advertise_routes }}
        {% endif %}
      register: tailscale_up
      changed_when: "'Success' in tailscale_up.stdout"
      no_log: true

    # `tailscale status` prints a whole peer line first, not just the IP;
    # `tailscale ip -4` yields exactly the node's IPv4 tailnet address.
    - name: Get Tailscale IPv4 address
      ansible.builtin.command: tailscale ip -4
      register: ts_ip
      changed_when: false

    - name: Display Tailscale IP
      ansible.builtin.debug:
        msg: "{{ inventory_hostname }} Tailscale IP: {{ ts_ip.stdout_lines[0] }}"
Prometheus + Node Exporter + Alertmanager
Full monitoring stack with Prometheus, Node Exporter, and Alertmanager using version-managed downloads, dedicated service users, and systemd units with proper handlers
---
# Ansible Playbook: Prometheus Monitoring Stack
# Deploys Prometheus, Node Exporter, and Alertmanager to monitoring hosts
# Usage: ansible-playbook -i inventory monitoring-stack.yml
# Selective: ansible-playbook -i inventory monitoring-stack.yml --tags prometheus
# ansible-playbook -i inventory monitoring-stack.yml --tags node_exporter
# ansible-playbook -i inventory monitoring-stack.yml --tags alertmanager
# NOTE(review): assumes systemd hosts with /usr/sbin/nologin (Debian/Ubuntu
# layout) -- confirm before running against other distros.
- name: Deploy Prometheus monitoring stack
hosts: monitoring_servers
become: yes
vars:
# -----------------------------------------------------------------
# Version variables: update these instead of hardcoding URLs
# -----------------------------------------------------------------
prometheus_version: "2.51.2"
node_exporter_version: "1.8.1"
alertmanager_version: "0.27.0"
# Architecture (amd64, arm64)
arch: "amd64"
# Paths
prometheus_config_dir: /etc/prometheus
prometheus_data_dir: /var/lib/prometheus
alertmanager_config_dir: /etc/alertmanager
alertmanager_data_dir: /var/lib/alertmanager
# Network -- all IPs use 10.42.0.x format
# NOTE(review): Prometheus/Alertmanager bind to 10.42.0.30 specifically;
# this assumes that address is configured on every monitoring host --
# confirm, or the services will fail to bind.
prometheus_listen: "10.42.0.30:9090"
alertmanager_listen: "10.42.0.30:9093"
node_exporter_listen: "0.0.0.0:9100"
# Scrape targets -- add hosts here
scrape_targets:
- "10.42.0.30:9100" # monitoring server itself
- "10.42.0.201:9100" # izar-host
- "10.42.0.199:9100" # altair-link
- "10.42.0.175:9100" # tau-host
- "10.42.0.10:9100" # capella-outpost
# Alertmanager Slack webhook (store in vault for production)
slack_webhook_url: "https://hooks.slack.com/services/REPLACE/WITH/REAL"
slack_channel: "#homelab-alerts"
# Retention
prometheus_retention: "30d"
prometheus_retention_size: "10GB"
# =================================================================
# HANDLERS -- restart services only when config changes
# =================================================================
# Tags removed from the handlers: Ansible does not apply tags to
# handlers -- a notified handler always runs -- so the previous
# `tags:` entries were dead weight that implied filtering which
# never happens.
handlers:
  # Full restart (with daemon-reload) -- used when binaries or unit
  # files change.
  - name: Restart prometheus
    ansible.builtin.systemd:
      name: prometheus
      state: restarted
      daemon_reload: true

  - name: Restart node_exporter
    ansible.builtin.systemd:
      name: node_exporter
      state: restarted
      daemon_reload: true

  - name: Restart alertmanager
    ansible.builtin.systemd:
      name: alertmanager
      state: restarted
      daemon_reload: true

  # Reload only -- used for config-only changes so Prometheus keeps
  # its TSDB state and in-flight scrapes.
  - name: Reload prometheus config
    ansible.builtin.systemd:
      name: prometheus
      state: reloaded
tasks:
# ===============================================================
# PROMETHEUS
# ===============================================================
# Dedicated non-login service account; owns config and data dirs.
- name: Create prometheus system user
ansible.builtin.user:
name: prometheus
system: yes
shell: /usr/sbin/nologin
create_home: no
tags: [prometheus]
- name: Create Prometheus directories
ansible.builtin.file:
path: "{{ item }}"
state: directory
owner: prometheus
group: prometheus
mode: "0755"
loop:
- "{{ prometheus_config_dir }}"
- "{{ prometheus_config_dir }}/rules"
- "{{ prometheus_data_dir }}"
tags: [prometheus]
# get_url's URL-form checksum downloads sha256sums.txt and matches the
# entry whose filename equals the downloaded file's name.
- name: Download Prometheus (version-managed)
ansible.builtin.get_url:
url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ arch }}.tar.gz"
dest: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
checksum: "sha256:https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/sha256sums.txt"
tags: [prometheus]
# `creates` makes the extract idempotent per version.
- name: Extract Prometheus binaries
ansible.builtin.unarchive:
src: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
dest: /tmp
remote_src: yes
creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ arch }}/prometheus"
tags: [prometheus]
# Copy replaces files atomically, so updating the binary of a running
# service is safe; the notified handler restarts it afterwards.
- name: Install Prometheus binaries
ansible.builtin.copy:
src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ arch }}/{{ item }}"
dest: "/usr/local/bin/{{ item }}"
remote_src: yes
owner: root
group: root
mode: "0755"
loop:
- prometheus
- promtool
notify: Restart prometheus
tags: [prometheus]
# NOTE(review): `validate` runs promtool on a temp copy; the config's
# relative "rules/*.yml" glob resolves against the temp location --
# an empty glob passes, but confirm validation once rule files exist.
- name: Deploy Prometheus configuration
ansible.builtin.template:
src: prometheus.yml.j2
dest: "{{ prometheus_config_dir }}/prometheus.yml"
owner: prometheus
group: prometheus
mode: "0644"
validate: "/usr/local/bin/promtool check config %s"
notify: Reload prometheus config
tags: [prometheus]
# --- Template content (prometheus.yml.j2) ---
# global:
# scrape_interval: 15s
# evaluation_interval: 15s
# alerting:
# alertmanagers:
# - static_configs:
# - targets: ['{{ alertmanager_listen }}']
# rule_files:
# - "rules/*.yml"
# scrape_configs:
# - job_name: 'node'
# static_configs:
# - targets: {{ scrape_targets | to_json }}
- name: Deploy Prometheus systemd unit
# Note: for OpenRC hosts, deploy an /etc/init.d/prometheus script
# and /etc/conf.d/prometheus instead of this systemd unit
ansible.builtin.template:
src: prometheus.service.j2
dest: /etc/systemd/system/prometheus.service
mode: "0644"
notify: Restart prometheus
tags: [prometheus]
# --- Unit file content ---
# [Unit]
# Description=Prometheus Monitoring
# After=network-online.target
# Wants=network-online.target
# [Service]
# User=prometheus
# Group=prometheus
# Type=simple
# ExecStart=/usr/local/bin/prometheus \
# --config.file={{ prometheus_config_dir }}/prometheus.yml \
# --storage.tsdb.path={{ prometheus_data_dir }} \
# --storage.tsdb.retention.time={{ prometheus_retention }} \
# --storage.tsdb.retention.size={{ prometheus_retention_size }} \
# --web.listen-address={{ prometheus_listen }} \
# --web.enable-lifecycle
# ExecReload=/bin/kill -HUP $MAINPID
# Restart=on-failure
# RestartSec=5
# LimitNOFILE=65536
# [Install]
# WantedBy=multi-user.target
- name: Enable and start Prometheus
ansible.builtin.systemd:
name: prometheus
state: started
enabled: yes
tags: [prometheus]
# ===============================================================
# NODE EXPORTER
# ===============================================================
- name: Create node_exporter system user
ansible.builtin.user:
name: node_exporter
system: yes
shell: /usr/sbin/nologin
create_home: no
tags: [node_exporter]
# Checksum is verified against the release's sha256sums.txt by filename.
- name: Download Node Exporter (version-managed)
ansible.builtin.get_url:
url: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-{{ arch }}.tar.gz"
dest: "/tmp/node_exporter-{{ node_exporter_version }}.tar.gz"
checksum: "sha256:https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/sha256sums.txt"
tags: [node_exporter]
- name: Extract Node Exporter
ansible.builtin.unarchive:
src: "/tmp/node_exporter-{{ node_exporter_version }}.tar.gz"
dest: /tmp
remote_src: yes
creates: "/tmp/node_exporter-{{ node_exporter_version }}.linux-{{ arch }}/node_exporter"
tags: [node_exporter]
- name: Install Node Exporter binary
ansible.builtin.copy:
src: "/tmp/node_exporter-{{ node_exporter_version }}.linux-{{ arch }}/node_exporter"
dest: /usr/local/bin/node_exporter
remote_src: yes
owner: root
group: root
mode: "0755"
notify: Restart node_exporter
tags: [node_exporter]
# The unit body below is a literal block scalar templated by `copy`
# (Jinja expands {{ node_exporter_listen }}). The `$$` in the exclude
# regex is systemd escaping for a literal `$` -- do not "fix" it.
# NOTE(review): --collector.systemd needs D-Bus access for the
# unprivileged node_exporter user -- confirm it reports units.
- name: Deploy Node Exporter systemd unit
# Note: for OpenRC, create /etc/init.d/node_exporter with
# start-stop-daemon --start --user node_exporter --exec /usr/local/bin/node_exporter
ansible.builtin.copy:
dest: /etc/systemd/system/node_exporter.service
mode: "0644"
content: |
[Unit]
Description=Prometheus Node Exporter
After=network-online.target
Wants=network-online.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter \
--web.listen-address={{ node_exporter_listen }} \
--collector.systemd \
--collector.processes \
--collector.filesystem.mount-points-exclude="^/(sys|proc|dev|host|etc)($$|/)" \
--collector.netclass.ignored-devices="^(veth|docker|br-).*"
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target
notify: Restart node_exporter
tags: [node_exporter]
- name: Enable and start Node Exporter
ansible.builtin.systemd:
name: node_exporter
state: started
enabled: yes
tags: [node_exporter]
# ===============================================================
# ALERTMANAGER
# ===============================================================
- name: Create alertmanager system user
ansible.builtin.user:
name: alertmanager
system: yes
shell: /usr/sbin/nologin
create_home: no
tags: [alertmanager]
- name: Create Alertmanager directories
ansible.builtin.file:
path: "{{ item }}"
state: directory
owner: alertmanager
group: alertmanager
mode: "0755"
loop:
- "{{ alertmanager_config_dir }}"
- "{{ alertmanager_data_dir }}"
tags: [alertmanager]
- name: Download Alertmanager (version-managed)
ansible.builtin.get_url:
url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-{{ arch }}.tar.gz"
dest: "/tmp/alertmanager-{{ alertmanager_version }}.tar.gz"
checksum: "sha256:https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/sha256sums.txt"
tags: [alertmanager]
- name: Extract Alertmanager
ansible.builtin.unarchive:
src: "/tmp/alertmanager-{{ alertmanager_version }}.tar.gz"
dest: /tmp
remote_src: yes
creates: "/tmp/alertmanager-{{ alertmanager_version }}.linux-{{ arch }}/alertmanager"
tags: [alertmanager]
- name: Install Alertmanager binaries
ansible.builtin.copy:
src: "/tmp/alertmanager-{{ alertmanager_version }}.linux-{{ arch }}/{{ item }}"
dest: "/usr/local/bin/{{ item }}"
remote_src: yes
owner: root
group: root
mode: "0755"
loop:
- alertmanager
- amtool
notify: Restart alertmanager
tags: [alertmanager]
# NOTE(review): the rendered file contains the Slack webhook URL;
# consider tightening mode to "0640" since the webhook is a secret.
- name: Deploy Alertmanager configuration
ansible.builtin.template:
src: alertmanager.yml.j2
dest: "{{ alertmanager_config_dir }}/alertmanager.yml"
owner: alertmanager
group: alertmanager
mode: "0644"
validate: "/usr/local/bin/amtool check-config %s"
notify: Restart alertmanager
tags: [alertmanager]
# --- Template content (alertmanager.yml.j2) ---
# The {{ "{{" }} / {{ "}}" }} dance below emits literal Go-template
# braces for Alertmanager while the file itself is a Jinja template.
# global:
# resolve_timeout: 5m
# route:
# group_by: ['alertname', 'job']
# group_wait: 30s
# group_interval: 5m
# repeat_interval: 4h
# receiver: 'slack-notifications'
# receivers:
# - name: 'slack-notifications'
# slack_configs:
# - api_url: '{{ slack_webhook_url }}'
# channel: '{{ slack_channel }}'
# send_resolved: true
# title: '[{{ "{{" }} .Status | toUpper {{ "}}" }}] {{ "{{" }} .CommonLabels.alertname {{ "}}" }}'
# text: '{{ "{{" }} range .Alerts {{ "}}" }}*{{ "{{" }} .Annotations.summary {{ "}}" }}*{{ "{{" }} end {{ "}}" }}'
- name: Deploy Alertmanager systemd unit
ansible.builtin.copy:
dest: /etc/systemd/system/alertmanager.service
mode: "0644"
content: |
[Unit]
Description=Prometheus Alertmanager
After=network-online.target
Wants=network-online.target
[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/usr/local/bin/alertmanager \
--config.file={{ alertmanager_config_dir }}/alertmanager.yml \
--storage.path={{ alertmanager_data_dir }} \
--web.listen-address={{ alertmanager_listen }}
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target
notify: Restart alertmanager
tags: [alertmanager]
- name: Enable and start Alertmanager
ansible.builtin.systemd:
name: alertmanager
state: started
enabled: yes
tags: [alertmanager]
Proxmox VM Provisioning
Terraform module to create VMs on Proxmox with cloud-init
# Terraform: Proxmox VM Module
# Creates VMs with cloud-init configuration from a cloned template.
# All variables now carry explicit types so bad inputs fail at plan
# time instead of silently coercing.

terraform {
  required_providers {
    proxmox = {
      source  = "Telmate/proxmox"
      version = "~> 2.9"
    }
  }
}

# NOTE(review): not referenced in this module -- presumably consumed by a
# provider block defined elsewhere; confirm before removing.
variable "proxmox_host" {
  description = "Proxmox host IP"
  type        = string
  default     = "10.42.0.201"
}

variable "vm_name" {
  description = "Name of the VM"
  type        = string
}

variable "target_node" {
  description = "Proxmox node to deploy on"
  type        = string
  default     = "icarus"
}

variable "cores" {
  description = "Number of CPU cores"
  type        = number
  default     = 4
}

variable "memory" {
  description = "RAM in MB"
  type        = number
  default     = 4096
}

variable "disk_size" {
  description = "Boot disk size (Proxmox size string, e.g. \"32G\")"
  type        = string
  default     = "32G"
}

variable "ip_address" {
  description = "Static IP address (no CIDR suffix; /24 is appended below)"
  type        = string
}

variable "gateway" {
  description = "Network gateway"
  type        = string
  default     = "10.42.0.1"
}

variable "ssh_keys" {
  description = "SSH public keys for cloud-init (newline-separated)"
  type        = string
}

resource "proxmox_vm_qemu" "vm" {
  name        = var.vm_name
  target_node = var.target_node
  clone       = "ubuntu-cloud-template"
  cores       = var.cores
  sockets     = 1
  memory      = var.memory
  agent       = 1 # Enable QEMU guest agent

  disk {
    storage = "local-zfs"
    size    = var.disk_size
    type    = "scsi"
  }

  network {
    model  = "virtio"
    bridge = "vmbr0"
  }

  # Cloud-init configuration
  os_type   = "cloud-init"
  ipconfig0 = "ip=${var.ip_address}/24,gw=${var.gateway}"
  ciuser    = "commander"
  sshkeys   = var.ssh_keys

  # The provider reports spurious network diffs after clone; ignore them.
  lifecycle {
    ignore_changes = [
      network,
    ]
  }

  tags = "terraform,${var.vm_name}"
}

output "vm_ip" {
  value = var.ip_address
}

output "vm_id" {
  value = proxmox_vm_qemu.vm.vmid
}
Cloudflare Tunnel Configuration
Terraform module for Cloudflare Tunnel and DNS records
# Terraform: Cloudflare Tunnel
# Manages tunnel configuration and DNS records
# NOTE(review): later cloudflare provider 4.x releases rename these
# resources to cloudflare_zero_trust_tunnel_cloudflared(_config);
# cloudflare_tunnel still works under ~> 4.0 but is deprecated --
# renaming requires state moves, so it is left as-is here.
terraform {
required_providers {
cloudflare = {
source = "cloudflare/cloudflare"
version = "~> 4.0"
}
}
}
variable "cloudflare_account_id" {
description = "Cloudflare account ID"
type = string
sensitive = true
}
variable "cloudflare_zone_id" {
description = "Cloudflare zone ID for your domain"
type = string
}
variable "domain" {
description = "Base domain name"
default = "argobox.com"
}
variable "tunnel_secret" {
description = "Tunnel secret (base64)"
type = string
sensitive = true
}
# Each service becomes one ingress rule and one CNAME record.
variable "services" {
description = "Services to expose through tunnel"
type = list(object({
subdomain = string
service = string
port = number
}))
default = [
{ subdomain = "git", service = "localhost", port = 3000 },
{ subdomain = "ai", service = "localhost", port = 30000 },
{ subdomain = "vault", service = "localhost", port = 31745 },
]
}
# Create the tunnel
resource "cloudflare_tunnel" "homelab" {
account_id = var.cloudflare_account_id
name = "homelab-tunnel"
secret = var.tunnel_secret
}
# Configure tunnel routes
resource "cloudflare_tunnel_config" "homelab" {
account_id = var.cloudflare_account_id
tunnel_id = cloudflare_tunnel.homelab.id
config {
dynamic "ingress_rule" {
for_each = var.services
content {
hostname = "${ingress_rule.value.subdomain}.${var.domain}"
service = "http://${ingress_rule.value.service}:${ingress_rule.value.port}"
}
}
# Catch-all rule (required)
ingress_rule {
service = "http_status:404"
}
}
}
# Create DNS records pointing to tunnel
# Keyed by subdomain so adding/removing a service only touches its
# own record in state.
# NOTE(review): `value` is deprecated in favor of `content` in later
# 4.x provider releases -- confirm before upgrading the provider.
resource "cloudflare_record" "tunnel_dns" {
for_each = { for s in var.services : s.subdomain => s }
zone_id = var.cloudflare_zone_id
name = each.value.subdomain
value = "${cloudflare_tunnel.homelab.id}.cfargotunnel.com"
type = "CNAME"
proxied = true
}
output "tunnel_id" {
value = cloudflare_tunnel.homelab.id
}
output "tunnel_token" {
value = cloudflare_tunnel.homelab.tunnel_token
sensitive = true
}
Restic + Rsync Hybrid Backup
Versioned backups with Restic to SFTP/S3 remote targets, fast local snapshots via rsync, pre/post hooks for containers and databases, retention policy enforcement, and integrity checking
#!/bin/bash
# ===================================================================
# Restic + Rsync Hybrid Backup Script
# -------------------------------------------------------------------
# Restic: versioned, deduplicated backups to a remote SFTP or S3 repo
# Rsync: fast local snapshots for quick restores
#
# Cron example (daily at 2 AM):
# 0 2 * * * /usr/local/bin/backup.sh >> /var/log/backup.log 2>&1
# ===================================================================
# -e is required here: without it a failed restic/rsync run falls
# through to the end of the script, BACKUP_STATUS gets set to OK, and
# a *success* notification is sent for a failed backup. -E makes the
# EXIT/ERR trap machinery see failures inside functions too.
set -Eeuo pipefail

# ---------------------------------------------------------------
# Lock file -- prevent concurrent runs (fd 200 held for the whole
# script lifetime; flock releases it automatically on exit)
# ---------------------------------------------------------------
LOCKFILE="/var/run/backup-homelab.lock"
exec 200>"${LOCKFILE}"
if ! flock -n 200; then
    echo "[ERROR] Another backup is already running (lockfile: ${LOCKFILE}). Exiting."
    exit 1
fi

# ---------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------
HOSTNAME_SHORT=$(hostname -s)
LOG_FILE="/var/log/backup.log"
NTFY_URL="https://ntfy.sh/argobox-backups"
# Restic repository -- SFTP target on NAS at 10.42.0.50
# For S3: export RESTIC_REPOSITORY="s3:https://s3.example.com/backups"
export RESTIC_REPOSITORY="sftp:backup@10.42.0.50:/volume1/backups/restic/${HOSTNAME_SHORT}"
export RESTIC_PASSWORD_FILE="/etc/backup/restic-password"
# Rsync local snapshot destination
LOCAL_SNAPSHOT_DIR="/mnt/backups/${HOSTNAME_SHORT}/latest"
# Directories to back up (use arrays -- no eval needed)
BACKUP_PATHS=(
    "/etc"
    "/home/commander"
    "/opt/docker"
    "/opt/stacks"
    "/var/lib/docker/volumes"
)
# Paths to exclude
EXCLUDE_PATTERNS=(
    "*.tmp"
    "*.cache"
    "**/node_modules"
    "**/.git"
    "**/__pycache__"
    "**/*.log"
    "**/lost+found"
)
# Containers to stop before backup
STOP_CONTAINERS=("postgres" "mariadb")
# Databases to dump
POSTGRES_CONTAINER="postgres"
POSTGRES_DATABASES=("gitea" "nextcloud")
DB_DUMP_DIR="/tmp/backup-db-dumps"
# Retention policy
KEEP_DAILY=7
KEEP_WEEKLY=4
KEEP_MONTHLY=6
KEEP_YEARLY=1
# ---------------------------------------------------------------
# Logging helper -- timestamped, level-tagged line to stdout + log
# ---------------------------------------------------------------
log() {
    local severity="$1"
    shift
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[${stamp}] [${severity}] $*" | tee -a "${LOG_FILE}"
}

# ---------------------------------------------------------------
# Notification helper -- push a message to ntfy; never fail the run
# (|| true swallows curl/network errors on purpose)
# ---------------------------------------------------------------
notify() {
    local title="$1" message="$2"
    local priority="${3:-default}"
    curl -s -o /dev/null \
        -H "Title: ${title}" \
        -H "Priority: ${priority}" \
        -d "${message}" \
        "${NTFY_URL}" || true
}
# ---------------------------------------------------------------
# Error handler
# ---------------------------------------------------------------
# BACKUP_STATUS is flipped to "OK" only at the very end of the main
# sequence, so ANY early exit path reports failure.
BACKUP_STATUS="FAILED"
# Runs on every exit (trap EXIT below): restarts containers, sends a
# failure notification if the script did not reach the success marker,
# and removes the temporary DB dump directory.
cleanup() {
local exit_code=$?
# Always restart containers, even on failure
# (docker start is idempotent, so the extra call on the success path --
# where post_backup_hooks already ran -- is harmless)
post_backup_hooks || true
if [[ "${BACKUP_STATUS}" != "OK" ]]; then
log "ERROR" "Backup failed with exit code ${exit_code}"
notify "Backup FAILED [${HOSTNAME_SHORT}]" \
"Backup failed at $(date). Check ${LOG_FILE} for details." \
"urgent"
fi
rm -rf "${DB_DUMP_DIR}" 2>/dev/null || true
}
trap cleanup EXIT
# ---------------------------------------------------------------
# Pre-backup hooks: dump databases, THEN stop containers
# ---------------------------------------------------------------
pre_backup_hooks() {
    log "INFO" "Running pre-backup hooks"

    # Dump PostgreSQL databases first. The previous order stopped the
    # containers before dumping, but STOP_CONTAINERS includes the
    # postgres container itself, so `docker exec pg_dump` always ran
    # against a stopped container and every dump failed (logged only
    # as WARN). Dumping while the DB is still up fixes that.
    mkdir -p "${DB_DUMP_DIR}"
    if docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then
        for db in "${POSTGRES_DATABASES[@]}"; do
            log "INFO" "Dumping PostgreSQL database: ${db}"
            docker exec "${POSTGRES_CONTAINER}" \
                pg_dump -U postgres -Fc "${db}" > "${DB_DUMP_DIR}/${db}.dump" \
                || log "WARN" "Failed to dump database: ${db}"
        done
    fi

    # Stop listed containers gracefully (30s grace before SIGKILL)
    for ctr in "${STOP_CONTAINERS[@]}"; do
        if docker ps --format '{{.Names}}' | grep -q "^${ctr}$"; then
            log "INFO" "Stopping container: ${ctr}"
            docker stop "${ctr}" --time 30
        fi
    done
}

# ---------------------------------------------------------------
# Post-backup hooks: restart the containers stopped above
# ---------------------------------------------------------------
post_backup_hooks() {
    log "INFO" "Running post-backup hooks"
    for ctr in "${STOP_CONTAINERS[@]}"; do
        # -a: the container exists but is stopped at this point
        if docker ps -a --format '{{.Names}}' | grep -q "^${ctr}$"; then
            log "INFO" "Starting container: ${ctr}"
            docker start "${ctr}"
        fi
    done
}
# ---------------------------------------------------------------
# Build --exclude argument lists for restic and rsync from the
# shared EXCLUDE_PATTERNS array
# ---------------------------------------------------------------
RESTIC_EXCLUDES=()
RSYNC_EXCLUDES=()
for glob in "${EXCLUDE_PATTERNS[@]}"; do
    RESTIC_EXCLUDES+=(--exclude "${glob}")
    RSYNC_EXCLUDES+=(--exclude "${glob}")
done
# ---------------------------------------------------------------
# Main backup sequence
# ---------------------------------------------------------------
log "INFO" "====== Backup started for ${HOSTNAME_SHORT} ======"
# 1. Pre-backup hooks (stop containers, dump DBs)
pre_backup_hooks
# 2. Include database dumps in backup paths
if [[ -d "${DB_DUMP_DIR}" ]] && ls "${DB_DUMP_DIR}"/*.dump &>/dev/null; then
BACKUP_PATHS+=("${DB_DUMP_DIR}")
fi
# 3. Initialize restic repo if it does not exist
# NOTE(review): `restic snapshots` also fails on network/auth errors,
# in which case `restic init` will fail too and abort the run.
if ! restic snapshots --quiet &>/dev/null; then
log "INFO" "Initializing new restic repository"
restic init
fi
# 4. Restic backup -- versioned, deduplicated
# A restic failure must abort the run so the EXIT trap reports FAILED;
# this relies on errexit + pipefail being set at the top of the script.
log "INFO" "Starting restic backup to ${RESTIC_REPOSITORY}"
restic backup \
"${RESTIC_EXCLUDES[@]}" \
--tag "${HOSTNAME_SHORT}" \
--tag "scheduled" \
--verbose \
"${BACKUP_PATHS[@]}" 2>&1 | tee -a "${LOG_FILE}"
# 5. Rsync local snapshot -- fast local copy for quick restores
# Each source is mirrored under LOCAL_SNAPSHOT_DIR with its full path
# (e.g. /etc -> ${LOCAL_SNAPSHOT_DIR}/etc); --delete keeps the mirror exact.
log "INFO" "Rsync local snapshot to ${LOCAL_SNAPSHOT_DIR}"
mkdir -p "${LOCAL_SNAPSHOT_DIR}"
for src in "${BACKUP_PATHS[@]}"; do
if [[ -d "${src}" ]]; then
dest_subdir="${LOCAL_SNAPSHOT_DIR}${src}"
mkdir -p "${dest_subdir}"
rsync -a --delete "${RSYNC_EXCLUDES[@]}" "${src}/" "${dest_subdir}/" \
2>&1 | tee -a "${LOG_FILE}"
else
log "WARN" "Source path does not exist, skipping: ${src}"
fi
done
# 6. Post-backup hooks (restart containers)
post_backup_hooks
# 7. Enforce retention policy (scoped to this host's tag)
log "INFO" "Enforcing retention policy"
restic forget \
--keep-daily ${KEEP_DAILY} \
--keep-weekly ${KEEP_WEEKLY} \
--keep-monthly ${KEEP_MONTHLY} \
--keep-yearly ${KEEP_YEARLY} \
--prune \
--tag "${HOSTNAME_SHORT}" 2>&1 | tee -a "${LOG_FILE}"
# 8. Integrity check (run weekly -- check day-of-week; 7 = Sunday)
DOW=$(date +%u)
if [[ "${DOW}" -eq 7 ]]; then
log "INFO" "Running weekly integrity check"
restic check --read-data-subset=5% 2>&1 | tee -a "${LOG_FILE}"
fi
# 9. Report ("?" placeholders if counting/sizing fails)
SNAPSHOT_COUNT=$(restic snapshots --tag "${HOSTNAME_SHORT}" --json | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "?")
LOCAL_SIZE=$(du -sh "${LOCAL_SNAPSHOT_DIR}" 2>/dev/null | cut -f1 || echo "?")
log "INFO" "Backup complete. Restic snapshots: ${SNAPSHOT_COUNT}. Local snapshot size: ${LOCAL_SIZE}"
# Success marker: reaching this line is what makes cleanup() NOT send
# the failure notification.
BACKUP_STATUS="OK"
notify "Backup OK [${HOSTNAME_SHORT}]" \
"Restic snapshots: ${SNAPSHOT_COUNT}. Local: ${LOCAL_SIZE}. Duration: ${SECONDS}s."
log "INFO" "====== Backup finished in ${SECONDS}s ======"
Docker Maintenance Script
Comprehensive Docker maintenance with selective image cleanup (7-day retention), safe volume pruning, builder cache limits, container health checks, log rotation verification, and ntfy notifications
#!/bin/bash
# ===================================================================
# Docker Maintenance Script
# -------------------------------------------------------------------
# Performs selective cleanup of Docker resources with safety checks.
# Designed to run weekly via cron or systemd timer.
#
# Cron: 0 3 * * 0 /usr/local/bin/docker-maintenance.sh
# ===================================================================
# -e is deliberately omitted: the ERR trap below counts failures and
# the script continues with the remaining phases. -E propagates the
# ERR trap into functions and subshells.
set -Euo pipefail
# ---------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------
LOG_FILE="/var/log/docker-maintenance.log"
NTFY_URL="https://ntfy.sh/argobox-docker"
IMAGE_RETAIN_DAYS=7 # Keep images used within this window
BUILDER_CACHE_LIMIT="5GB" # Max builder cache to retain
LOGROTATE_CONF="/etc/logrotate.d/docker-containers"
# ---------------------------------------------------------------
# Logging
# ---------------------------------------------------------------
# ERRORS accumulates failed-command count via the ERR trap;
# NOTE(review): presumably reported in a final summary phase that is
# outside this chunk -- confirm.
ERRORS=0
# Timestamped, level-tagged line to stdout and the log file.
log() {
local level="$1"; shift
echo "[$(date '+%Y-%m-%d %H:%M:%S')] [${level}] $*" | tee -a "${LOG_FILE}"
}
# ---------------------------------------------------------------
# Error handling -- count errors but continue
# ---------------------------------------------------------------
on_error() {
ERRORS=$((ERRORS + 1))
log "ERROR" "Command failed on line ${BASH_LINENO[0]}"
}
trap on_error ERR
# ---------------------------------------------------------------
# Notification
# ---------------------------------------------------------------
# Push a message to ntfy; || true keeps notification failures from
# tripping the ERR trap.
notify() {
local title="$1"; local body="$2"; local priority="${3:-default}"
curl -s -o /dev/null \
-H "Title: ${title}" \
-H "Priority: ${priority}" \
-d "${body}" \
"${NTFY_URL}" || true
}
# ===================================================================
# Phase 1: Pre-cleanup disk usage report
# ===================================================================
log "INFO" "====== Docker Maintenance Started ======"
log "INFO" "--- Disk usage BEFORE cleanup ---"
docker system df -v 2>&1 | tee -a "${LOG_FILE}"
# First row of `docker system df` output is presumably the Images line;
# NOTE(review): DISK_BEFORE is not used within this chunk -- confirm it
# is consumed by a later summary.
DISK_BEFORE=$(docker system df --format '{{.Size}}' | head -1)
# ===================================================================
# Phase 2: Container health check
# ===================================================================
log "INFO" "--- Container health check ---"
UNHEALTHY=$(docker ps --filter "health=unhealthy" --format '{{.Names}} ({{.Image}}) status={{.Status}}' 2>/dev/null || true)
if [[ -n "${UNHEALTHY}" ]]; then
log "WARN" "Unhealthy containers detected:"
while IFS= read -r line; do
log "WARN" " ${line}"
done <<< "${UNHEALTHY}"
else
log "INFO" "All running containers are healthy"
fi
# Report containers in restart loops (restarting status)
RESTARTING=$(docker ps --filter "status=restarting" --format '{{.Names}}' 2>/dev/null || true)
if [[ -n "${RESTARTING}" ]]; then
log "WARN" "Containers in restart loop: ${RESTARTING}"
fi
# ===================================================================
# Phase 3: Remove exited and dead containers
# ===================================================================
log "INFO" "--- Removing exited/dead containers ---"
# Repeated --filter status flags are OR'd by docker: exited OR dead.
DEAD_CONTAINERS=$(docker ps -aq --filter "status=exited" --filter "status=dead" 2>/dev/null || true)
if [[ -n "${DEAD_CONTAINERS}" ]]; then
DEAD_COUNT=$(echo "${DEAD_CONTAINERS}" | wc -w)
log "INFO" "Removing ${DEAD_COUNT} exited/dead containers"
echo "${DEAD_CONTAINERS}" | xargs docker rm 2>&1 | tee -a "${LOG_FILE}"
else
log "INFO" "No exited/dead containers to remove"
fi
# ===================================================================
# Phase 4: Selective image cleanup
# Keep images used by running containers and images pulled/used
# within the last IMAGE_RETAIN_DAYS days. Remove the rest.
# ===================================================================
log "INFO" "--- Selective image cleanup (retain ${IMAGE_RETAIN_DAYS}-day window) ---"
# Collect images used by running containers (never remove these)
RUNNING_IMAGES=$(docker ps --format '{{.Image}}' | sort -u)
# Collect all image IDs
ALL_IMAGES=$(docker images --format '{{.ID}}|{{.Repository}}:{{.Tag}}|{{.CreatedAt}}' 2>/dev/null || true)
REMOVED_IMAGES=0
# Cutoff timestamp: GNU date syntax first, BSD date fallback
CUTOFF_TS=$(date -d "-${IMAGE_RETAIN_DAYS} days" +%s 2>/dev/null || date -v-${IMAGE_RETAIN_DAYS}d +%s)
while IFS='|' read -r img_id img_name img_created; do
    [[ -z "${img_id}" ]] && continue
    # Skip images used by running containers. -F matches the image name
    # as a fixed string, not a regex (names contain dots and slashes).
    if echo "${RUNNING_IMAGES}" | grep -qF "${img_name}"; then
        continue
    fi
    # Parse the creation date. Docker's CreatedAt looks like
    # "2024-05-01 12:00:00 +0000 UTC"; GNU date cannot parse the trailing
    # zone NAME after a numeric offset, so strip the " UTC" suffix.
    img_ts=$(date -d "${img_created% UTC}" +%s 2>/dev/null || true)
    if [[ -z "${img_ts}" ]]; then
        # Fail SAFE: the previous `|| echo "0"` fallback made any image
        # with an unparsable date look infinitely old and DELETED it.
        # Skip such images instead.
        log "WARN" "Cannot parse creation date for ${img_name} -- skipping"
        continue
    fi
    if [[ "${img_ts}" -lt "${CUTOFF_TS}" ]]; then
        log "INFO" "Removing old image: ${img_name} (created: ${img_created})"
        docker rmi "${img_id}" 2>/dev/null && REMOVED_IMAGES=$((REMOVED_IMAGES + 1)) || true
    fi
done <<< "${ALL_IMAGES}"
log "INFO" "Removed ${REMOVED_IMAGES} images older than ${IMAGE_RETAIN_DAYS} days"
# Also remove dangling (untagged) images
DANGLING_COUNT=$(docker images -f "dangling=true" -q | wc -l)
if [[ "${DANGLING_COUNT}" -gt 0 ]]; then
    log "INFO" "Removing ${DANGLING_COUNT} dangling images"
    docker image prune -f 2>&1 | tee -a "${LOG_FILE}"
fi
# ===================================================================
# Phase 5: Volume cleanup (safe -- only anonymous volumes)
# ===================================================================
# Named volumes likely hold user data, so they are only reported here,
# never removed. Anonymous volumes (auto-generated 64-char hex names)
# are assumed disposable and pruned.
log "INFO" "--- Volume cleanup ---"
# List named volumes that are NOT in use (warn but do not remove).
# grep -v filters OUT pure 64-hex names, leaving the human-named ones;
# `|| true` keeps an empty grep result from aborting under `set -e`.
UNUSED_NAMED=$(docker volume ls --filter "dangling=true" --format '{{.Name}}' | grep -v '^[0-9a-f]\{64\}$' || true)
if [[ -n "${UNUSED_NAMED}" ]]; then
log "WARN" "Unused NAMED volumes detected (not removing -- review manually):"
while IFS= read -r vol; do
log "WARN" " ${vol}"
done <<< "${UNUSED_NAMED}"
fi
# Only prune anonymous volumes (64-char hex names)
ANON_VOLUMES=$(docker volume ls --filter "dangling=true" --format '{{.Name}}' | grep '^[0-9a-f]\{64\}$' || true)
if [[ -n "${ANON_VOLUMES}" ]]; then
ANON_COUNT=$(echo "${ANON_VOLUMES}" | wc -l)
log "INFO" "Removing ${ANON_COUNT} anonymous dangling volumes"
# xargs -r: do nothing on empty input; tee mirrors output into the log
echo "${ANON_VOLUMES}" | xargs -r docker volume rm 2>&1 | tee -a "${LOG_FILE}"
else
log "INFO" "No anonymous dangling volumes"
fi
# ===================================================================
# Phase 6: Network cleanup
# ===================================================================
log "INFO" "--- Network cleanup ---"
# Removes custom networks with no attached containers (Docker's built-in
# bridge/host/none networks are never pruned)
docker network prune -f 2>&1 | tee -a "${LOG_FILE}"
# ===================================================================
# Phase 7: Builder cache pruning with size limit
# ===================================================================
# BUILDER_CACHE_LIMIT is set earlier in this script (outside this excerpt);
# --keep-storage retains up to that much build cache instead of wiping all.
log "INFO" "--- Builder cache prune (keep ${BUILDER_CACHE_LIMIT}) ---"
docker builder prune -f --keep-storage "${BUILDER_CACHE_LIMIT}" 2>&1 | tee -a "${LOG_FILE}"
# ===================================================================
# Phase 8: Container log rotation check
# ===================================================================
log "INFO" "--- Log rotation verification ---"
if [[ -f "${LOGROTATE_CONF}" ]]; then
log "INFO" "Logrotate config exists at ${LOGROTATE_CONF}"
# Dry-run (-d) parses the config without rotating anything
if logrotate -d "${LOGROTATE_CONF}" &>/dev/null; then
log "INFO" "Logrotate config is valid"
else
log "WARN" "Logrotate config has issues -- check ${LOGROTATE_CONF}"
fi
else
log "WARN" "No logrotate config found at ${LOGROTATE_CONF}"
log "WARN" "Container logs may grow unbounded. Create a logrotate config or"
log "WARN" "set log-opts in /etc/docker/daemon.json: max-size=10m, max-file=3"
fi
# Check for oversized container logs (>100MB).
# Applies to the json-file logging driver, whose logs live at
# /var/lib/docker/containers/<id>/<id>-json.log
LARGE_LOGS=$(find /var/lib/docker/containers/ -name "*-json.log" -size +100M 2>/dev/null || true)
if [[ -n "${LARGE_LOGS}" ]]; then
log "WARN" "Container logs exceeding 100MB:"
while IFS= read -r logfile; do
SIZE=$(du -sh "${logfile}" | cut -f1)
# The log file's parent directory name is the full container ID
CONTAINER_ID=$(basename "$(dirname "${logfile}")")
# Strip the leading "/" docker inspect puts on container names;
# fall back to "unknown" for already-removed containers
CONTAINER_NAME=$(docker inspect --format '{{.Name}}' "${CONTAINER_ID}" 2>/dev/null | sed 's|^/||' || echo "unknown")
log "WARN" " ${CONTAINER_NAME}: ${SIZE} (${logfile})"
done <<< "${LARGE_LOGS}"
fi
# ===================================================================
# Phase 9: Post-cleanup disk usage report
# ===================================================================
log "INFO" "--- Disk usage AFTER cleanup ---"
docker system df -v 2>&1 | tee -a "${LOG_FILE}"
# head -1 takes the first row of `docker system df` output
# (presumably the Images row -- confirm against the docker version in use)
DISK_AFTER=$(docker system df --format '{{.Size}}' | head -1)
# ===================================================================
# Phase 10: Summary and notification
# ===================================================================
# DISK_BEFORE, ERRORS, UNHEALTHY, log() and notify() are defined earlier
# in the script (outside this excerpt).
SUMMARY="Docker maintenance complete on $(hostname -s).
Before: ${DISK_BEFORE}. After: ${DISK_AFTER}.
Images removed: ${REMOVED_IMAGES}. Errors: ${ERRORS}."
if [[ -n "${UNHEALTHY}" ]]; then
# $'...' so \n is appended as a real newline
SUMMARY+=$'\nUnhealthy containers detected -- check logs.'
fi
log "INFO" "${SUMMARY}"
if [[ "${ERRORS}" -gt 0 ]]; then
# High-priority notification plus non-zero exit so cron/systemd records
# the run as failed; note the final "Finished" log line is skipped here.
notify "Docker Maintenance [$(hostname -s)] - ${ERRORS} errors" \
"${SUMMARY}" "high"
exit 1
else
notify "Docker Maintenance [$(hostname -s)] - OK" "${SUMMARY}"
fi
log "INFO" "====== Docker Maintenance Finished ======"
OpenRC Service Template
Template for creating OpenRC init scripts (Gentoo/Alpine)
#!/sbin/openrc-run
# OpenRC Service Template
# Place in /etc/init.d/ and chmod +x
# Enable with: rc-update add servicename default
#
# NOTE: with command/command_args/command_background set below, openrc-run
# already provides built-in start/stop/status. The custom start/stop/status
# functions further down OVERRIDE those built-ins and are included to show
# the pattern -- delete them if the variable-driven defaults suffice.
name="myservice"
description="My Custom Service"
# Service configuration
command="/usr/local/bin/myservice"
command_args="--config /etc/myservice/config.yaml"
command_user="commander"
command_group="commander"
# true = let start-stop-daemon daemonize the process (for services that
# do not fork on their own)
command_background=true
# PID file location (RC_SVCNAME is set by OpenRC to this script's name)
pidfile="/run/${RC_SVCNAME}.pid"
# Log configuration
output_log="/var/log/${RC_SVCNAME}.log"
error_log="/var/log/${RC_SVCNAME}.err"
# Dependencies: need = hard dependency, after = ordering only,
# use = optional (start after them if they are scheduled)
depend() {
need net
after firewall
use dns logger
}
# Pre-start checks: checkpath creates/fixes ownership and permissions of
# runtime paths before the daemon starts
start_pre() {
checkpath --directory --owner ${command_user}:${command_group} --mode 0755 /var/lib/myservice
checkpath --file --owner ${command_user}:${command_group} --mode 0640 /etc/myservice/config.yaml
}
# Custom start function (optional -- overrides the built-in default)
start() {
ebegin "Starting ${name}"
start-stop-daemon --start \
--exec ${command} \
--user ${command_user} \
--group ${command_group} \
--background \
--make-pidfile \
--pidfile ${pidfile} \
--stdout ${output_log} \
--stderr ${error_log} \
-- ${command_args}
eend $?
}
# Custom stop function (optional -- overrides the built-in default)
stop() {
ebegin "Stopping ${name}"
start-stop-daemon --stop \
--exec ${command} \
--pidfile ${pidfile}
eend $?
}
# Status check: 0 when the PID recorded in the pidfile is alive,
# 3 (LSB "program is not running") otherwise.
# kill -0 only tests process existence; it delivers no signal.
status() {
if [ -f "${pidfile}" ]; then
if kill -0 $(cat ${pidfile}) 2>/dev/null; then
einfo "${name} is running (PID: $(cat ${pidfile}))"
return 0
fi
fi
einfo "${name} is not running"
return 3
}
Systemd Timer Template
Systemd service and timer for scheduled tasks
# Systemd Timer: backup.timer
# Place in /etc/systemd/system/
# Enable with: systemctl enable --now backup.timer
# === backup.service ===
# [Unit]
# Description=Automated Backup Service
# After=network-online.target
# Wants=network-online.target
#
# [Service]
# Type=oneshot
# ExecStart=/usr/local/bin/backup.sh
# User=root
# StandardOutput=journal
# StandardError=journal
#
# [Install]
# WantedBy=multi-user.target
# === backup.timer ===
[Unit]
Description=Run backup daily at 3 AM

[Timer]
# Run at 03:00 every day
OnCalendar=*-*-* 03:00:00
# Spread the start by up to 15 minutes to avoid thundering-herd load
RandomizedDelaySec=900
# Run once immediately if the machine was off at the scheduled time
Persistent=true
# NOTE: an earlier revision had "OnBootSec=5min" here with a comment claiming
# it prevents runs right after boot. It does the opposite: OnBootSec ADDS a
# trigger 5 minutes after every boot. Persistent= already handles missed
# runs, so the monotonic trigger is removed.

[Install]
WantedBy=timers.target
# === Useful timer expressions ===
# OnCalendar=hourly # Every hour
# OnCalendar=daily # Every day at midnight
# OnCalendar=weekly # Every Monday at midnight
# OnCalendar=*-*-* 04:00:00 # Every day at 4 AM
# OnCalendar=Mon *-*-* 02:00 # Every Monday at 2 AM
# OnCalendar=*-*-01 00:00:00 # First of every month
# === Commands ===
# systemctl list-timers # List all timers
# systemctl status backup.timer # Check timer status
# systemctl start backup.service # Run manually
# journalctl -u backup.service # View logs
ZFS Snapshot Management
Automated ZFS snapshots with retention policy
#!/bin/bash
# ZFS Snapshot Management Script
# Creates snapshots with automatic rotation
# -e: abort on error; -u: unset variables are errors;
# pipefail: a failure anywhere in a pipeline fails the whole pipeline
set -euo pipefail
# Configuration
POOL="tank-storage" # ZFS pool snapshotted recursively
SNAPSHOT_PREFIX="auto" # marks snapshots managed by this script
HOURLY_KEEP=24 # retain the newest 24 hourly snapshots
DAILY_KEEP=30 # retain the newest 30 daily snapshots
WEEKLY_KEEP=12 # retain the newest 12 weekly snapshots
# Timestamped logger: writes "[YYYY-mm-dd HH:MM:SS] <message>" to stdout.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# Create a recursive snapshot of ${POOL} named
#   <pool>@<prefix>_<type>_<YYYYmmdd_HHMMSS>
# $1: snapshot type label (hourly|daily|weekly)
create_snapshot() {
    local snap_type="$1"
    local timestamp
    # Declare and assign separately: `local x=$(cmd)` masks cmd's exit
    # status (ShellCheck SC2155), defeating `set -e`. Split, a failing
    # date command aborts the script as intended.
    timestamp=$(date +%Y%m%d_%H%M%S)
    local snap_name="${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_${timestamp}"
    log "Creating snapshot: ${snap_name}"
    zfs snapshot -r "${snap_name}"
}
# List snapshots by type
# Prints snapshot names of type $1, one per line, OLDEST FIRST
# (`-s creation` sorts ascending by creation time) -- cleanup_snapshots
# relies on this ordering. `|| true` keeps an empty grep result from
# aborting the script under `set -e -o pipefail`.
list_snapshots() {
local snap_type="$1"
zfs list -t snapshot -o name -s creation | grep "${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_" || true
}
# Delete old snapshots
# Keeps the newest $2 snapshots of type $1 and destroys the surplus,
# oldest first (list_snapshots returns ascending creation order, so the
# first `count - keep` entries are the oldest).
cleanup_snapshots() {
local snap_type="$1"
local keep="$2"
# Unquoted word-splitting into an array is safe here: ZFS snapshot names
# cannot contain whitespace.
local snapshots=($(list_snapshots "${snap_type}"))
local count=${#snapshots[@]}
if (( count > keep )); then
local to_delete=$((count - keep))
log "Cleaning up ${to_delete} old ${snap_type} snapshots"
for ((i=0; i<to_delete; i++)); do
log "Deleting: ${snapshots[i]}"
# -r destroys the snapshot recursively across child datasets,
# matching the recursive `zfs snapshot -r` that created it
zfs destroy -r "${snapshots[i]}"
done
else
log "No ${snap_type} snapshots to clean (have ${count}, keep ${keep})"
fi
}
# Main logic based on argument
# Dispatch on the first CLI argument. Defaults to "hourly" so an
# argument-less cron entry still does something sensible; ${1:-...}
# also keeps `set -u` from failing when no argument is given.
case "${1:-hourly}" in
hourly)
create_snapshot "hourly"
cleanup_snapshots "hourly" $HOURLY_KEEP
;;
daily)
create_snapshot "daily"
cleanup_snapshots "daily" $DAILY_KEEP
;;
weekly)
create_snapshot "weekly"
cleanup_snapshots "weekly" $WEEKLY_KEEP
;;
list)
echo "=== Hourly Snapshots ==="
list_snapshots "hourly"
echo
echo "=== Daily Snapshots ==="
list_snapshots "daily"
echo
echo "=== Weekly Snapshots ==="
list_snapshots "weekly"
;;
status)
echo "=== ZFS Pool Status ==="
zpool status $POOL
echo
echo "=== Snapshot Counts ==="
echo "Hourly: $(list_snapshots hourly | wc -l) / $HOURLY_KEEP"
echo "Daily: $(list_snapshots daily | wc -l) / $DAILY_KEEP"
echo "Weekly: $(list_snapshots weekly | wc -l) / $WEEKLY_KEEP"
;;
*)
# Unknown subcommand: print usage and exit non-zero
echo "Usage: $0 {hourly|daily|weekly|list|status}"
exit 1
;;
esac
log "Done"
Ansible Role Structure
Complete Ansible role for Docker host provisioning showing proper directory layout with tasks, handlers, defaults, vars, meta, and templates. Initialize with: ansible-galaxy init roles/docker-host
# ===========================================================
# Ansible Role: docker-host
# ===========================================================
# Directory structure:
# roles/docker-host/
# ├── tasks/main.yml # Core task logic
# ├── handlers/main.yml # Service restart handlers
# ├── templates/ # Jinja2 templates
# │ └── daemon.json.j2 # Docker daemon config
# ├── defaults/main.yml # Default variables (overridable)
# ├── vars/main.yml # Role-internal variables
# └── meta/main.yml # Galaxy metadata + dependencies
#
# Create scaffold: ansible-galaxy init roles/docker-host
# Usage in playbook:
# - hosts: docker_hosts
# roles:
# - role: docker-host
# docker_log_max_size: "50m"
# ===========================================================
# ===================== defaults/main.yml ====================
# These variables can be overridden in playbooks, group_vars,
# or host_vars. They define sane defaults for all Docker hosts.
# ============================================================
# docker_edition: "ce"
# docker_version: "5:26.1.4-1~ubuntu.22.04~jammy"
# docker_compose_version: "2.27.0"
#
# # Daemon configuration defaults
# docker_log_driver: "json-file"
# docker_log_max_size: "10m"
# docker_log_max_file: "3"
# docker_storage_driver: "overlay2"
# docker_live_restore: true
# docker_default_address_pool_base: "172.16.0.0/12"  # NOTE: must be a CIDR-aligned block; 172.17.0.0/12 has host bits set and overlaps Docker's default bridge (172.17.0.0/16)
# docker_default_address_pool_size: 24
#
# # User to add to the docker group
# docker_users:
# - commander
#
# # Data root (set to a dedicated disk/partition if available)
# docker_data_root: "/var/lib/docker"
# ===================== vars/main.yml ========================
# Internal variables -- not intended to be overridden.
# ============================================================
# docker_apt_key_url: "https://download.docker.com/linux/ubuntu/gpg"
# docker_apt_repo: "deb https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
# docker_packages:
# - "docker-{{ docker_edition }}"
# - "docker-{{ docker_edition }}-cli"
# - "containerd.io"
# - "docker-buildx-plugin"
# - "docker-compose-plugin"
# ===================== meta/main.yml ========================
# Galaxy metadata and role dependencies.
# ============================================================
# galaxy_info:
# author: commander
# description: "Provision and configure Docker CE hosts"
# company: ArgoBox Homelab
# license: MIT
# min_ansible_version: "2.15"
# platforms:
# - name: Ubuntu
# versions:
# - jammy
# - noble
# - name: Debian
# versions:
# - bookworm
# galaxy_tags:
# - docker
# - containers
# - homelab
#
# dependencies:
# # Run common role first (sets up users, SSH keys, base packages)
# - role: common
# ===================== handlers/main.yml ====================
# Handlers are triggered by 'notify' in tasks. They run once
# at the end of the play, even if notified multiple times.
# ============================================================
---
# handlers/main.yml
# Handlers run once at the end of the play regardless of how many tasks
# notified them. `listen` is a TASK-level keyword (a sibling of `name`,
# not a module argument): it lets tasks notify a stable topic string.
- name: Restart docker
  ansible.builtin.service:
    name: docker
    state: restarted
  listen: "restart docker"

- name: Reload docker daemon
  ansible.builtin.systemd:
    name: docker
    state: reloaded
    daemon_reload: true
  listen: "reload docker"
# ===================== tasks/main.yml =======================
# Main task file -- installs Docker, deploys daemon config,
# adds users to docker group, starts the service.
# ============================================================
# tasks/main.yml
# Installs Docker CE from the official APT repo, deploys daemon.json,
# adds users to the docker group, and verifies the daemon is running.
- name: Install prerequisites
  ansible.builtin.apt:
    name:
      - apt-transport-https
      - ca-certificates
      - curl
      - gnupg
      - lsb-release
      - python3-docker
    state: present
    update_cache: true
  tags: [docker, packages]

- name: Add Docker GPG key
  # NOTE(review): apt_key relies on the deprecated apt-key tool, which is
  # removed on current Debian/Ubuntu releases. Prefer downloading the key
  # to /etc/apt/keyrings and using a signed-by clause in the repo line.
  ansible.builtin.apt_key:
    url: "{{ docker_apt_key_url }}"
    state: present
  tags: [docker, repo]

- name: Add Docker APT repository
  ansible.builtin.apt_repository:
    repo: "{{ docker_apt_repo }}"
    state: present
    filename: docker
  tags: [docker, repo]

- name: Install Docker packages
  ansible.builtin.apt:
    name: "{{ docker_packages }}"
    state: present
    update_cache: true
  tags: [docker, packages]

- name: Create Docker config directory
  ansible.builtin.file:
    path: /etc/docker
    state: directory
    owner: root
    group: root
    mode: "0755"
  tags: [docker, config]

- name: Deploy daemon.json from template
  ansible.builtin.template:
    src: daemon.json.j2
    dest: /etc/docker/daemon.json
    owner: root
    group: root
    mode: "0644"
  notify: restart docker
  tags: [docker, config]

# --- Template: templates/daemon.json.j2 ---
# {
#   "log-driver": "{{ docker_log_driver }}",
#   "log-opts": {
#     "max-size": "{{ docker_log_max_size }}",
#     "max-file": "{{ docker_log_max_file }}"
#   },
#   "storage-driver": "{{ docker_storage_driver }}",
#   "live-restore": {{ docker_live_restore | lower }},
#   "data-root": "{{ docker_data_root }}",
#   "default-address-pools": [
#     {
#       "base": "{{ docker_default_address_pool_base }}",
#       "size": {{ docker_default_address_pool_size }}
#     }
#   ],
#   "features": {
#     "buildkit": true
#   }
# }

- name: Add users to docker group
  ansible.builtin.user:
    name: "{{ item }}"
    groups: docker
    append: true
  loop: "{{ docker_users }}"
  tags: [docker, users]

- name: Enable and start Docker service
  ansible.builtin.service:
    name: docker
    state: started
    enabled: true
  tags: [docker, service]

- name: Verify Docker is running
  ansible.builtin.command: docker info
  changed_when: false
  register: docker_info
  tags: [docker, verify]

- name: Show Docker version
  ansible.builtin.debug:
    # NOTE(review): stdout_lines[1] is fragile -- `docker info` output layout
    # varies across Docker versions; confirm the index or parse explicitly.
    msg: "Docker {{ docker_info.stdout_lines[1] | trim }} installed on {{ inventory_hostname }}"
  tags: [docker, verify]
# ===================== Playbook using this role =============
# site.yml or docker-hosts.yml:
#
# ---
# - name: Provision Docker hosts
# hosts: docker_hosts
# become: yes
# roles:
# - role: docker-host
# docker_log_max_size: "50m"
# docker_users:
# - commander
# - deploy
Ansible Inventory + Group Vars
Production Ansible inventory with host groups, group_vars, host_vars, and ansible.cfg using ArgoBox star-themed naming and 10.42.0.x addressing
# ===========================================================
# Ansible Inventory + Group Vars
# ===========================================================
# Directory layout:
# inventory/
# ├── hosts.yml # Host inventory
# ├── group_vars/
# │ ├── all.yml # Variables for every host
# │ ├── docker_hosts.yml # Docker-specific vars
# │ └── k3s_servers.yml # K3s server vars
# └── host_vars/
# └── izar.yml # Per-host overrides
#
# ansible.cfg # Project-level config
# ===========================================================
# ==================== inventory/hosts.yml ===================
# All IPs use 10.42.0.x format. Hostnames are star-themed.
# ============================================================
---
# inventory/hosts.yml -- all hosts on the 10.42.0.x network.
# A host may appear in several groups (e.g. altair-link); Ansible merges
# its per-host vars across every group it belongs to.
all:
  children:
    # ---------------------------------------------------------
    # Proxmox hypervisors
    # ---------------------------------------------------------
    proxmox:
      hosts:
        izar-host:
          ansible_host: 10.42.0.201
          proxmox_node: izar
        arcturus-host:
          ansible_host: 10.42.0.100
          proxmox_node: arcturus
    # ---------------------------------------------------------
    # Docker container hosts
    # ---------------------------------------------------------
    docker_hosts:
      hosts:
        altair-link:
          ansible_host: 10.42.0.199
          docker_data_root: /opt/docker
        capella-outpost:
          ansible_host: 10.42.0.10
          docker_data_root: /var/lib/docker
    # ---------------------------------------------------------
    # K3s cluster -- server (control plane) nodes
    # ---------------------------------------------------------
    k3s_servers:
      hosts:
        altair-link:
          ansible_host: 10.42.0.199
          k3s_role: server
          k3s_init: true
    # ---------------------------------------------------------
    # K3s cluster -- agent (worker) nodes
    # ---------------------------------------------------------
    k3s_agents:
      hosts:
        tau-host:
          ansible_host: 10.42.0.175
          k3s_role: agent
        sirius-station:
          ansible_host: 10.42.0.50
          k3s_role: agent
    # ---------------------------------------------------------
    # Monitoring servers
    # ---------------------------------------------------------
    monitoring_servers:
      hosts:
        altair-link:
          ansible_host: 10.42.0.199
# ============== inventory/group_vars/all.yml ================
# Shared variables applied to every host in the inventory.
# ============================================================
# --- all.yml ---
# ansible_user: commander
# ansible_become: true
# ansible_python_interpreter: /usr/bin/python3
#
# # DNS (local Pi-hole + fallback)
# dns_servers:
# - 10.42.0.1
# - 1.1.1.1
#
# # NTP
# ntp_servers:
# - 0.pool.ntp.org
# - 1.pool.ntp.org
#
# # Timezone
# timezone: "America/Chicago"
#
# # Admin user for all hosts
# admin_user: commander
# admin_ssh_key: "ssh-ed25519 AAAAC3NzaC1lZDI1... commander@capella-outpost"
#
# # Default domain
# domain: "argobox.local"
#
# # Notification endpoint
# ntfy_url: "https://ntfy.sh/argobox-alerts"
# ========== inventory/group_vars/docker_hosts.yml ===========
# Variables specific to hosts running Docker.
# ============================================================
# --- docker_hosts.yml ---
# docker_log_max_size: "10m"
# docker_log_max_file: "3"
# docker_storage_driver: "overlay2"
# docker_live_restore: true
# docker_default_address_pool_base: "172.16.0.0/12"  # NOTE: must be a CIDR-aligned block; 172.17.0.0/12 has host bits set and overlaps Docker's default bridge (172.17.0.0/16)
# docker_default_address_pool_size: 24
#
# # Compose stacks to deploy
# docker_stacks_dir: /opt/stacks
# docker_stacks:
# - portainer
# - traefik
# - monitoring
# ============ inventory/host_vars/izar.yml ==================
# Per-host overrides for izar-host (primary Proxmox node).
# ============================================================
# --- izar.yml ---
# proxmox_api_url: "https://10.42.0.201:8006/api2/json"
# proxmox_storage: "local-zfs"
# proxmox_backup_storage: "pbs-local"
# proxmox_vlan_aware: true
#
# # Izar has 64GB RAM -- allow larger VMs
# proxmox_default_memory: 8192
# proxmox_default_cores: 4
#
# # ZFS pool for VM storage
# zfs_pool: "rpool"
# ==================== ansible.cfg ===========================
# Project-level Ansible configuration. Place in the repo root.
# ============================================================
# [defaults]
# inventory = inventory/hosts.yml
# roles_path = roles
# vault_password_file = .vault-password
# stdout_callback = yaml
# host_key_checking = false
# retry_files_enabled = false
# gathering = smart
# fact_caching = jsonfile
# fact_caching_connection = /tmp/ansible-facts
# fact_caching_timeout = 3600
#
# [privilege_escalation]
# become = true
# become_method = sudo
# become_ask_pass = false
#
# [ssh_connection]
# pipelining = true
# ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ForwardAgent=no
Ansible Vault Usage
Working examples of Ansible Vault for encrypting secrets: vault files, inline encrypted strings, vault in CI/CD, and a playbook deploying Docker stacks with vault-stored credentials
# ===========================================================
# Ansible Vault Usage Examples
# ===========================================================
# Vault encrypts sensitive data so it can live in version control.
#
# Create encrypted file:
# ansible-vault create inventory/group_vars/vault.yml
#
# Edit encrypted file:
# ansible-vault edit inventory/group_vars/vault.yml
#
# Encrypt a single string:
# ansible-vault encrypt_string 'ExampleP@ss123!' --name 'db_password'
#
# Run playbook with vault:
# ansible-playbook -i inventory site.yml --ask-vault-pass
# ansible-playbook -i inventory site.yml --vault-password-file .vault-password
# ===========================================================
# ========= inventory/group_vars/vault.yml (encrypted) =======
# This file is encrypted at rest. Contents shown here for
# reference -- the actual file is AES-256 encrypted.
#
# Create with: ansible-vault create inventory/group_vars/vault.yml
# ============================================================
# --- vault.yml (plaintext contents before encryption) ---
# vault_db_root_password: "P@ssw0rd-CHANGE-ME"
# vault_db_gitea_password: "P@ssw0rd-CHANGE-ME"
# vault_db_nextcloud_password: "P@ssw0rd-CHANGE-ME"
# vault_grafana_admin_password: "P@ssw0rd-CHANGE-ME"
# vault_restic_repo_password: "P@ssw0rd-CHANGE-ME"
# vault_smtp_password: "P@ssw0rd-CHANGE-ME"
# vault_cloudflare_api_token: "cf-token-CHANGE-ME"
# vault_tailscale_authkey: "tskey-auth-CHANGE-ME"
# ========= Inline encrypted strings =========================
# For mixing encrypted and plaintext vars in the same file,
# use !vault | with the encrypted block from encrypt_string.
# ============================================================
# --- inventory/group_vars/docker_hosts.yml ---
# docker_registry_user: commander
# docker_registry_password: !vault |
# $ANSIBLE_VAULT;1.1;AES256
# 61626364656667686970717273747576
# ... (encrypted block from ansible-vault encrypt_string)
#
# smtp_relay_host: mail.argobox.local
# smtp_relay_password: !vault |
# $ANSIBLE_VAULT;1.1;AES256
# 31323334353637383940414243444546
# ... (encrypted block)
# ========= Playbook using vault secrets ====================
# This playbook deploys a Docker stack with database credentials
# pulled from the vault-encrypted vars file.
# ============================================================
---
- name: Deploy application stack with vault secrets
  hosts: docker_hosts
  become: true
  vars_files:
    # Load encrypted vault file alongside regular vars.
    # NOTE(review): files under inventory/group_vars/ are normally
    # auto-loaded for matching groups; the explicit vars_files is only
    # needed when running against a different inventory layout.
    - inventory/group_vars/vault.yml
  vars:
    stack_name: "app-stack"
    stack_dir: "/opt/stacks/{{ stack_name }}"
  tasks:
    - name: Create stack directory
      ansible.builtin.file:
        path: "{{ stack_dir }}"
        state: directory
        owner: commander
        group: docker
        mode: "0750"

    - name: Deploy .env file with vault secrets
      ansible.builtin.template:
        src: templates/stack-env.j2
        dest: "{{ stack_dir }}/.env"
        owner: commander
        group: docker
        mode: "0600"
      # Template would contain:
      # POSTGRES_ROOT_PASSWORD={{ vault_db_root_password }}
      # POSTGRES_GITEA_PASSWORD={{ vault_db_gitea_password }}
      # GRAFANA_ADMIN_PASSWORD={{ vault_grafana_admin_password }}
      # SMTP_PASSWORD={{ vault_smtp_password }}
      no_log: true  # Prevent secrets from appearing in output

    - name: Deploy docker-compose.yml
      ansible.builtin.template:
        src: templates/docker-compose.yml.j2
        dest: "{{ stack_dir }}/docker-compose.yml"
        owner: commander
        group: docker
        mode: "0640"

    - name: Start the stack
      community.docker.docker_compose_v2:
        project_src: "{{ stack_dir }}"
        state: present
      register: stack_result

    - name: Show deployment result
      ansible.builtin.debug:
        msg: "Stack {{ stack_name }} deployed. Services: {{ stack_result.services | default({}) | list | join(', ') }}"
# ========= CI/CD vault integration ==========================
# For automated pipelines where --ask-vault-pass is not viable.
# ============================================================
# Option 1: vault password file (referenced in ansible.cfg)
# --- ansible.cfg ---
# [defaults]
# vault_password_file = .vault-password
#
# .vault-password contains a single line with the passphrase.
# Add to .gitignore so it never enters version control:
# echo ".vault-password" >> .gitignore
# Option 2: Environment variable lookup
# --- In a playbook or vars file ---
# vault_password_from_env: "{{ lookup('env', 'ANSIBLE_VAULT_PASSWORD') }}"
#
# CI/CD pipeline sets the env var:
# export ANSIBLE_VAULT_PASSWORD="your-vault-password-here"
# ansible-playbook site.yml
# Option 3: Script-based vault password
# --- ansible.cfg ---
# vault_password_file = scripts/get-vault-pass.sh
#
# --- scripts/get-vault-pass.sh ---
# #!/bin/bash
# # Pull vault password from a secret manager
# # e.g., from pass, 1Password CLI, or HashiCorp Vault
# pass show ansible/vault-password
# ========= Rekeying vault files =============================
# Change the encryption password on all vault files:
# ansible-vault rekey inventory/group_vars/vault.yml
# ansible-vault rekey --new-vault-password-file new-pass.txt *.yml
# ============================================================
Terraform Backend + Variables
Production Terraform setup with S3-compatible backend (Minio), state locking, typed variables with validation, outputs, and version constraints
# ===========================================================
# Terraform: Production Project Structure
# ===========================================================
# Directory layout:
# terraform-project/
# ├── backend.tf # State backend configuration
# ├── versions.tf # Provider version constraints
# ├── variables.tf # Input variable definitions
# ├── terraform.tfvars # Variable values (do not commit secrets)
# ├── main.tf # Resource definitions
# └── outputs.tf # Output values
#
# Workflow:
# terraform init # Initialize backend + download providers
# terraform plan # Preview changes (always review before apply)
# terraform apply # Apply changes
# terraform destroy # Tear down (use with caution)
# ===========================================================
# ===================== versions.tf ==========================
# Pin provider versions to avoid breaking changes.
# ============================================================
# Pin Terraform core and each provider to a compatible-release range so
# upgrades are deliberate rather than accidental.
terraform {
  required_version = ">= 1.7.0, < 2.0.0"

  required_providers {
    proxmox = {
      source  = "Telmate/proxmox"
      version = "~> 2.9"
    }
    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.29"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.13"
    }
  }
}
# ===================== backend.tf ===========================
# S3-compatible backend using Minio on the local network.
# State locking prevents concurrent modifications.
# ============================================================
terraform {
  backend "s3" {
    bucket = "terraform-state"
    key    = "homelab/argobox.tfstate"
    region = "us-east-1" # Required by the backend; ignored by Minio

    # Terraform >= 1.6 syntax: custom service endpoints live in an
    # `endpoints` map. The old top-level `endpoint` / `dynamodb_endpoint`
    # arguments (and `force_path_style`) are deprecated and rejected by
    # newer releases -- and this project pins required_version >= 1.7.0.
    endpoints = {
      s3       = "https://10.42.0.20:9000" # Minio (local S3-compatible storage)
      dynamodb = "https://10.42.0.20:9001"
    }

    # State locking table.
    # NOTE(review): stock Minio does NOT expose a DynamoDB-compatible API;
    # confirm what actually serves :9001, or use a real DynamoDB-compatible
    # lock store (or Terraform 1.10+ native S3 lockfiles).
    dynamodb_table = "terraform-locks"

    # Minio serves buckets path-style, not virtual-hosted
    use_path_style              = true
    skip_credentials_validation = true
    skip_metadata_api_check     = true
    skip_requesting_account_id  = true

    # Credentials: set via environment variables
    # export AWS_ACCESS_KEY_ID="minio-access-key"
    # export AWS_SECRET_ACCESS_KEY="minio-secret-key"
  }
}
# ===================== variables.tf =========================
# Typed variables with validation and sensible defaults.
# ============================================================
# Deployment environment selector, restricted to known values.
variable "environment" {
  description = "Deployment environment"
  type        = string
  default     = "homelab"

  validation {
    condition     = contains(["homelab", "staging", "production"], var.environment)
    error_message = "Environment must be homelab, staging, or production."
  }
}

variable "proxmox_api_url" {
  description = "Proxmox VE API URL"
  type        = string
  default     = "https://10.42.0.201:8006/api2/json"

  validation {
    condition     = can(regex("^https://", var.proxmox_api_url))
    error_message = "Proxmox API URL must use HTTPS."
  }
}

# Marked sensitive so the token is redacted from plan/apply output.
variable "proxmox_api_token" {
  description = "Proxmox API token (user@realm!tokenid=secret)"
  type        = string
  sensitive   = true
}

variable "ssh_public_key" {
  description = "SSH public key for cloud-init provisioned VMs"
  type        = string
}

# One entry per VM; `template` falls back to the shared cloud image.
variable "vm_definitions" {
  description = "Map of VMs to create"
  type = map(object({
    cores    = number
    memory   = number
    disk     = string
    ip       = string
    node     = string
    template = optional(string, "ubuntu-cloud-template")
  }))

  validation {
    condition     = alltrue([for vm in var.vm_definitions : vm.cores >= 1 && vm.cores <= 32])
    error_message = "VM cores must be between 1 and 32."
  }

  validation {
    condition     = alltrue([for vm in var.vm_definitions : vm.memory >= 512])
    error_message = "VM memory must be at least 512 MB."
  }
}

variable "gateway" {
  description = "Default gateway for the 10.42.0.x network"
  type        = string
  default     = "10.42.0.1"
}

variable "dns_servers" {
  description = "DNS servers for VMs"
  type        = list(string)
  default     = ["10.42.0.1", "1.1.1.1"]
}
# ===================== terraform.tfvars =====================
# Example variable values. Copy to terraform.tfvars and fill in.
# Do NOT commit this file if it contains secrets.
# ============================================================
# --- terraform.tfvars ---
# environment = "homelab"
#
# proxmox_api_token = "terraform@pam!tf-token=xxxx-xxxx-xxxx"
#
# ssh_public_key = "ssh-ed25519 AAAAC3NzaC1lZDI1... commander@capella-outpost"
#
# vm_definitions = {
# "gitea" = {
# cores = 2
# memory = 4096
# disk = "32G"
# ip = "10.42.0.30"
# node = "izar"
# }
# "monitoring" = {
# cores = 4
# memory = 8192
# disk = "64G"
# ip = "10.42.0.31"
# node = "izar"
# }
# }
# ===================== outputs.tf ===========================
# Expose useful information after apply.
# ============================================================
# output "vm_ips" {
# description = "Map of VM names to their IP addresses"
# value = { for name, vm in var.vm_definitions : name => vm.ip }
# }
#
# output "vm_ids" {
# description = "Map of VM names to Proxmox VM IDs"
# value = { for name, vm in proxmox_vm_qemu.vms : name => vm.vmid }
# }
#
# output "proxmox_url" {
# description = "Proxmox web UI URL"
# value = var.proxmox_api_url
# }
#
# output "state_backend" {
# description = "Terraform state location"
# value = "s3://terraform-state/homelab/argobox.tfstate @ 10.42.0.20"
# }
Terraform K8s Namespace Provisioning
Provision Kubernetes namespaces with ResourceQuotas, LimitRanges, default-deny NetworkPolicies, and registry credentials using for_each for scalable management
# ===========================================================
# Terraform: Kubernetes Namespace Provisioning
# ===========================================================
# Creates namespaces with security defaults:
# - ResourceQuota (prevent resource exhaustion)
# - LimitRange (enforce per-pod limits)
# - NetworkPolicy (default-deny ingress)
# - Registry pull secret (private container images)
#
# Uses for_each to manage multiple namespaces from a variable.
# ===========================================================
# Provider requirements for this module.
terraform {
  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.29"
    }
  }
}

# Configure the K8s provider to talk to the local cluster via kubeconfig.
provider "kubernetes" {
  config_path    = "~/.kube/config"
  config_context = "k3s-homelab"
}
# ===================== Variables ============================
# Map of namespace name -> quota/limit settings. Adding a key here creates
# a full namespace (quota, limits, netpol, pull secret) on the next apply.
variable "namespaces" {
  description = "Map of namespace names to their quota settings"
  type = map(object({
    cpu_request    = string
    cpu_limit      = string
    memory_request = string
    memory_limit   = string
    pod_limit      = number
    labels         = optional(map(string), {})
  }))
  default = {
    "gitea" = {
      cpu_request    = "2"
      cpu_limit      = "4"
      memory_request = "2Gi"
      memory_limit   = "4Gi"
      pod_limit      = 20
      labels         = { tier = "platform", app = "gitea" }
    }
    "monitoring" = {
      cpu_request    = "4"
      cpu_limit      = "8"
      memory_request = "4Gi"
      memory_limit   = "8Gi"
      pod_limit      = 50
      labels         = { tier = "observability" }
    }
    "apps" = {
      cpu_request    = "2"
      cpu_limit      = "4"
      memory_request = "2Gi"
      memory_limit   = "4Gi"
      pod_limit      = 30
      labels         = { tier = "application" }
    }
  }
}

variable "registry_server" {
  description = "Container registry URL"
  type        = string
  default     = "https://registry.10.42.0.199.nip.io"
}

variable "registry_username" {
  description = "Registry pull credentials -- username"
  type        = string
  default     = "commander"
}

# No default on purpose: supply via TF_VAR_registry_password or tfvars.
variable "registry_password" {
  description = "Registry pull credentials -- password"
  type        = string
  sensitive   = true
}
# ===================== Namespaces ===========================
resource "kubernetes_namespace" "ns" {
for_each = var.namespaces
metadata {
name = each.key
labels = merge(
{
"managed-by" = "terraform"
"environment" = "homelab"
},
each.value.labels
)
annotations = {
"description" = "Managed by Terraform -- do not modify manually"
}
}
}
# ===================== ResourceQuota ========================
# Prevents any single namespace from consuming all cluster resources.
resource "kubernetes_resource_quota" "quota" {
  for_each = var.namespaces
  metadata {
    name = "${each.key}-quota"
    # Reference the namespace resource (not each.key directly) so the
    # quota is ordered after namespace creation in the dependency graph.
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }
  spec {
    # Aggregate ceilings across ALL pods in the namespace; pod_limit is
    # a number but Terraform coerces it to the string the API expects.
    hard = {
      "requests.cpu"    = each.value.cpu_request
      "limits.cpu"      = each.value.cpu_limit
      "requests.memory" = each.value.memory_request
      "limits.memory"   = each.value.memory_limit
      "pods"            = each.value.pod_limit
    }
  }
}
# ===================== LimitRange ===========================
# Sets default requests/limits for pods that do not specify them.
resource "kubernetes_limit_range" "limits" {
  for_each = var.namespaces
  metadata {
    name      = "${each.key}-limits"
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }
  spec {
    limit {
      type = "Container"
      # Injected into containers that declare no limits of their own --
      # without this, such pods would be rejected by the ResourceQuota.
      default = {
        cpu    = "500m"
        memory = "512Mi"
      }
      # Injected when a container declares no requests.
      default_request = {
        cpu    = "100m"
        memory = "128Mi"
      }
      # Per-container ceiling. NOTE(review): this reuses the namespace-
      # wide quota value, so one container may legally request the whole
      # namespace budget -- confirm that is intended.
      max = {
        cpu    = each.value.cpu_limit
        memory = each.value.memory_limit
      }
    }
  }
}
# ===================== NetworkPolicy ========================
# Default-deny ingress for every namespace. Services that need
# ingress must define their own NetworkPolicy to allow it.
resource "kubernetes_network_policy" "default_deny" {
  for_each = var.namespaces
  metadata {
    name      = "default-deny-ingress"
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }
  spec {
    # Empty pod_selector matches every pod in the namespace.
    pod_selector {}
    # Only Ingress is governed here; egress remains unrestricted.
    policy_types = ["Ingress"]
    # No ingress rules = deny all ingress by default.
    # Individual apps should create allow policies as needed.
  }
}
# ===================== Registry Secret ======================
# Deploy image pull secrets so pods can pull from private registries.
resource "kubernetes_secret" "registry_creds" {
  for_each = var.namespaces
  metadata {
    # Same secret name in every namespace, so pod specs can reference
    # imagePullSecrets: [{name: registry-credentials}] uniformly.
    name      = "registry-credentials"
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }
  type = "kubernetes.io/dockerconfigjson"
  # The provider's `data` field takes plaintext and base64-encodes it
  # for the API (only `binary_data` expects pre-encoded input), so the
  # dockerconfigjson payload is built as a plain JSON string here.
  # NOTE(review): the credentials are stored in Terraform state --
  # protect/encrypt the state backend.
  data = {
    ".dockerconfigjson" = jsonencode({
      auths = {
        # Parenthesized to use the variable's VALUE as the map key.
        (var.registry_server) = {
          username = var.registry_username
          password = var.registry_password
          # Docker clients expect auth = base64("user:password").
          auth = base64encode("${var.registry_username}:${var.registry_password}")
        }
      }
    })
  }
}
# ===================== Outputs ==============================
# List of namespace names actually created (from cluster state, not the
# variable), useful as an input to other root modules.
output "namespace_names" {
  description = "Created namespace names"
  value       = [for ns in kubernetes_namespace.ns : ns.metadata[0].name]
}
Packer VM Template
Build immutable VM images for Proxmox using Packer with Ubuntu cloud image source, shell and Ansible provisioners, and automatic template conversion
# ===========================================================
# Packer: Ubuntu VM Template for Proxmox
# ===========================================================
# Builds an immutable, pre-configured Ubuntu VM template
# that can be cloned by Terraform or the Proxmox UI.
#
# Build command:
# packer init ubuntu-template.pkr.hcl
# packer validate -var-file=variables.pkrvars.hcl ubuntu-template.pkr.hcl
# packer build -var-file=variables.pkrvars.hcl ubuntu-template.pkr.hcl
# ===========================================================
# Plugin requirements resolved by `packer init`: the Proxmox builder
# and the Ansible provisioner used in the build block below.
packer {
  required_plugins {
    proxmox = {
      version = ">= 1.1.8"
      source  = "github.com/hashicorp/proxmox"
    }
    ansible = {
      version = ">= 1.1.1"
      source  = "github.com/hashicorp/ansible"
    }
  }
}
# ===================== Variables ============================
variable "proxmox_api_url" {
  type        = string
  description = "Proxmox API endpoint"
  default     = "https://10.42.0.201:8006/api2/json"
}

# Token credentials have no defaults: supply them via -var-file or
# PKR_VAR_* environment variables so they stay out of the template.
variable "proxmox_api_token_id" {
  type        = string
  description = "Proxmox API token ID (user@realm!tokenname)"
  sensitive   = true
}

variable "proxmox_api_token_secret" {
  type        = string
  description = "Proxmox API token secret"
  sensitive   = true
}

# Proxmox node the build VM runs on.
variable "proxmox_node" {
  type    = string
  default = "izar"
}

# Name for both the build VM and the resulting template.
variable "template_name" {
  type    = string
  default = "ubuntu-cloud-template"
}

variable "ubuntu_iso" {
  type        = string
  description = "Ubuntu cloud image ISO"
  default     = "local:iso/ubuntu-24.04-live-server-amd64.iso"
}

# Provisioning SSH account; must match the user created by autoinstall.
variable "ssh_username" {
  type    = string
  default = "commander"
}

# NOTE(review): a sensitive variable with a placeholder default -- if
# the default is ever left in place the build fails at SSH connect.
# Always override via the pkrvars file (see example at end of file).
variable "ssh_password" {
  type      = string
  sensitive = true
  default   = "your-password-here"
}

variable "vm_cores" {
  type    = number
  default = 2
}

# Memory in MiB.
variable "vm_memory" {
  type    = number
  default = 4096
}

variable "vm_disk_size" {
  type    = string
  default = "32G"
}
# ===================== Source ===============================
# Proxmox ISO builder -- boots from the Ubuntu ISO, runs
# autoinstall, then hands off to provisioners.
# ============================================================
source "proxmox-iso" "ubuntu" {
  # Connection -- API token auth (the "username" carries the full
  # user@realm!tokenname ID). TLS verification is skipped because the
  # homelab Proxmox host serves a self-signed certificate.
  proxmox_url              = var.proxmox_api_url
  username                 = var.proxmox_api_token_id
  token                    = var.proxmox_api_token_secret
  insecure_skip_tls_verify = true
  node                     = var.proxmox_node

  # VM settings -- fixed VMID keeps the template at a predictable,
  # out-of-the-way ID; the build fails if VMID 9000 is already taken.
  vm_id   = 9000
  vm_name = var.template_name

  # Hardware
  cores   = var.vm_cores
  sockets = 1
  memory  = var.vm_memory
  os      = "l26" # Linux 2.6+ kernel

  # Boot disk
  disks {
    storage_pool = "local-zfs"
    disk_size    = var.vm_disk_size
    type         = "scsi"
    format       = "raw"
  }

  # Network
  network_adapters {
    model    = "virtio"
    bridge   = "vmbr0"
    firewall = false
  }

  # ISO and boot.
  # NOTE(review): newer releases of the Proxmox plugin deprecate the
  # top-level iso_file in favor of a boot_iso block -- confirm against
  # the plugin version `packer init` actually resolves.
  iso_file = var.ubuntu_iso
  # GRUB commands that kick off Ubuntu's autoinstall; presumably an
  # autoinstall data source (cloud-init NoCloud) is provided elsewhere
  # -- it is not visible in this file.
  boot_command = [
    "<esc><wait>",
    "linux /casper/vmlinuz --- autoinstall",
    "<enter><wait>",
    "initrd /casper/initrd",
    "<enter><wait>",
    "boot<enter>"
  ]
  boot_wait = "10s"

  # Cloud-init drive so clones can be customized at first boot.
  cloud_init              = true
  cloud_init_storage_pool = "local-zfs"

  # SSH connection for provisioning -- generous timeout because the
  # autoinstall must finish before SSH comes up.
  ssh_username = var.ssh_username
  ssh_password = var.ssh_password
  ssh_timeout  = "20m"

  # QEMU guest agent (required for Terraform integration)
  qemu_agent = true

  # Convert to template when done
  template_name        = var.template_name
  template_description = "Ubuntu 24.04 template built by Packer on ${timestamp()}"

  # Tags for organization in the Proxmox UI (semicolon-separated)
  tags = "template;ubuntu;packer"
}
# ===================== Build ================================
build {
  sources = ["source.proxmox-iso.ubuntu"]

  # ---------------------------------------------------------
  # Provisioner 1: Shell -- base system configuration
  # ---------------------------------------------------------
  # Each inline entry runs as one line of a generated shell script over
  # SSH; the "#" and "" entries are shell comments/blank lines kept for
  # readability of that script.
  provisioner "shell" {
    inline = [
      "# Wait for cloud-init to finish",
      "cloud-init status --wait",
      "",
      "# Update package index and upgrade",
      "sudo apt-get update -y",
      "sudo apt-get upgrade -y",
      "",
      "# Install essential packages",
      "sudo apt-get install -y \\",
      " qemu-guest-agent \\",
      " curl wget git vim \\",
      " ca-certificates gnupg \\",
      " python3 python3-pip \\",
      " unattended-upgrades \\",
      " fail2ban \\",
      " htop tmux jq",
      "",
      "# Enable QEMU guest agent",
      "sudo systemctl enable --now qemu-guest-agent",
      "",
      "# Clean up apt cache to reduce image size",
      "sudo apt-get autoremove -y",
      "sudo apt-get clean",
      "",
      "# Zero free space for better compression",
      "# (dd exits non-zero when the disk fills -- hence the || true)",
      "sudo dd if=/dev/zero of=/EMPTY bs=1M 2>/dev/null || true",
      "sudo rm -f /EMPTY",
      "",
      "# Clear machine-id so each clone gets a unique ID",
      "sudo truncate -s 0 /etc/machine-id",
      "sudo rm -f /var/lib/dbus/machine-id"
    ]
  }

  # ---------------------------------------------------------
  # Provisioner 2: File -- copy hardened SSH config
  # ---------------------------------------------------------
  # The file provisioner uploads as the SSH user, which cannot write to
  # /etc directly -- hence the two-step upload-to-/tmp then sudo-move.
  provisioner "file" {
    source      = "files/sshd_config"
    destination = "/tmp/sshd_config"
  }
  provisioner "shell" {
    inline = [
      "sudo mv /tmp/sshd_config /etc/ssh/sshd_config",
      "sudo chown root:root /etc/ssh/sshd_config",
      "sudo chmod 600 /etc/ssh/sshd_config"
    ]
  }

  # ---------------------------------------------------------
  # Provisioner 3: Ansible -- run hardening role
  # ---------------------------------------------------------
  # Runs ansible/harden.yml from the machine executing Packer against
  # the build VM over SSH.
  provisioner "ansible" {
    playbook_file = "ansible/harden.yml"
    user          = var.ssh_username
    extra_arguments = [
      "--extra-vars", "target_host=default"
    ]
  }
}
# ===================== variables.pkrvars.hcl ================
# Example variable values file. Copy and fill in secrets.
# ============================================================
# --- variables.pkrvars.hcl ---
# proxmox_api_url = "https://10.42.0.201:8006/api2/json"
# proxmox_api_token_id = "packer@pam!packer-token"
# proxmox_api_token_secret = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
# proxmox_node = "izar"
# ssh_password = "temporary-build-password"
Gentoo/OpenRC Service Script
Complete OpenRC init script with depend(), start(), stop(), status(), checkconfig(), start-stop-daemon usage, conf.d configuration, and a comparison with the equivalent systemd unit file
#!/sbin/openrc-run
# ===================================================================
# OpenRC Init Script: myapp
# ===================================================================
# A complete OpenRC service script for a generic application daemon.
# Designed for Gentoo Linux but works on any OpenRC distribution
# (Alpine, Artix, etc.).
#
# Installation:
# 1. Copy this file to /etc/init.d/myapp
# 2. chmod +x /etc/init.d/myapp
# 3. Copy conf.d file to /etc/conf.d/myapp
# 4. rc-update add myapp default
# 5. rc-service myapp start
#
# Management:
# rc-service myapp start|stop|restart|status
# rc-service myapp checkconfig # Validate before starting
# ===================================================================
# ---------------------------------------------------------------
# Service metadata -- used by rc-status and service listings
# ---------------------------------------------------------------
name="myapp"
description="MyApp Application Server"
# Extra rc-service subcommands: 'checkconfig' is callable in any state,
# 'reload' only while the service is started.
extra_commands="checkconfig"
extra_started_commands="reload"
description_checkconfig="Verify configuration before start"
description_reload="Reload configuration without restart"
# Defaults (overridden by /etc/conf.d/myapp, which openrc-run sources
# before this script). The ':' no-op with ${VAR:=default} assigns only
# when the variable is unset or empty.
: ${MYAPP_USER:="myapp"}
: ${MYAPP_GROUP:="myapp"}
: ${MYAPP_CONFIG:="/etc/myapp/config.yaml"}
: ${MYAPP_DATADIR:="/var/lib/myapp"}
: ${MYAPP_LOGDIR:="/var/log/myapp"}
# NOTE(review): default bind address is a specific homelab IP
# (10.42.0.199); override MYAPP_BIND in conf.d on other hosts.
: ${MYAPP_BIND:="10.42.0.199"}
: ${MYAPP_PORT:="8080"}
: ${MYAPP_EXTRA_OPTS:=""}
# Daemon invocation. command, command_args, pidfile, output_log and
# error_log are reused by the custom start()/stop() functions below;
# command_user and command_background would only drive OpenRC's default
# start handling, which the custom start() replaces.
command="/usr/local/bin/myapp"
command_args="--config ${MYAPP_CONFIG} --bind ${MYAPP_BIND}:${MYAPP_PORT} ${MYAPP_EXTRA_OPTS}"
command_user="${MYAPP_USER}:${MYAPP_GROUP}"
command_background=true
# RC_SVCNAME is set by openrc-run to this script's service name.
pidfile="/run/${RC_SVCNAME}.pid"
output_log="${MYAPP_LOGDIR}/${RC_SVCNAME}.log"
error_log="${MYAPP_LOGDIR}/${RC_SVCNAME}.err"
# ---------------------------------------------------------------
# depend() -- declare service dependencies and ordering
# ---------------------------------------------------------------
depend() {
    # Hard requirements: do not start without a configured network and
    # local filesystems mounted.
    need net localmount
    # Soft ordering: start after these when they are in the runlevel.
    after firewall dns postgresql
    # Optional services: pull in when available, never required.
    use logger dns
    # Advertise this service as the virtual "webapp" provider.
    provide webapp
}
# ---------------------------------------------------------------
# checkconfig() -- validate configuration before starting
# Called automatically by start_pre(), also callable manually:
# rc-service myapp checkconfig
# ---------------------------------------------------------------
# checkconfig() -- validate configuration before starting.
# Returns 0 when the config file exists, the binary is executable, and
# the application's own --validate-config pass succeeds; non-zero (with
# an eerror diagnostic) otherwise. Unlike the previous version, the
# validator's output is captured and echoed on failure instead of being
# discarded, so the operator can see WHY validation failed.
checkconfig() {
    # Verify the config file exists and is readable
    if [ ! -f "${MYAPP_CONFIG}" ]; then
        eerror "Config file not found: ${MYAPP_CONFIG}"
        return 1
    fi
    # Verify the binary exists
    if [ ! -x "${command}" ]; then
        eerror "Binary not found or not executable: ${command}"
        return 1
    fi
    # Run the application's built-in config validation if available,
    # keeping stdout+stderr so failures can be reported in detail.
    local validate_output
    if validate_output=$("${command}" --validate-config "${MYAPP_CONFIG}" 2>&1); then
        einfo "Configuration is valid"
    else
        eerror "Configuration validation failed:"
        # Re-emit each line of the validator's output through eerror.
        printf '%s\n' "${validate_output}" | while IFS= read -r line; do
            eerror "  ${line}"
        done
        return 1
    fi
    return 0
}
# ---------------------------------------------------------------
# start_pre() -- runs before start(). Create directories, check config.
# ---------------------------------------------------------------
# start_pre() -- runs before start(). Fails the start early on a bad
# config, then creates runtime directories with correct ownership.
start_pre() {
    # Validate configuration first
    checkconfig || return 1
    # checkpath is OpenRC's helper for creating/fixing paths. Data dir
    # is private to the service account; log dir is world-readable.
    checkpath --directory --owner "${MYAPP_USER}:${MYAPP_GROUP}" \
        --mode 0750 "${MYAPP_DATADIR}"
    checkpath --directory --owner "${MYAPP_USER}:${MYAPP_GROUP}" \
        --mode 0755 "${MYAPP_LOGDIR}"
    # NOTE(review): /run is normally a system-managed tmpfs that already
    # exists as root:root 0755 -- this call looks redundant; confirm
    # before removing.
    checkpath --directory --owner root:root \
        --mode 0755 /run
}
# ---------------------------------------------------------------
# start() -- launch the daemon using start-stop-daemon
# ---------------------------------------------------------------
# start() -- launch the daemon via start-stop-daemon: fork it into the
# background, drop privileges to the service account, write the pidfile
# (--make-pidfile, since the daemon does not write its own), and send
# stdout/stderr to the log files. --wait pauses (in milliseconds) after
# spawning and fails if the process already exited, catching immediate
# startup crashes. command_args is deliberately unquoted so it splits
# into individual arguments.
start() {
    ebegin "Starting ${name}"
    start-stop-daemon --start \
        --exec "${command}" \
        --user "${MYAPP_USER}" \
        --group "${MYAPP_GROUP}" \
        --background \
        --make-pidfile \
        --pidfile "${pidfile}" \
        --stdout "${output_log}" \
        --stderr "${error_log}" \
        --wait 1000 \
        -- ${command_args}
    # eend reports the previous command's exit status to rc.
    eend $?
}
# ---------------------------------------------------------------
# stop() -- gracefully stop the daemon
# ---------------------------------------------------------------
# stop() -- gracefully stop the daemon. --retry "SIGTERM/30/SIGKILL/5"
# sends SIGTERM, waits up to 30s for exit, then escalates to SIGKILL
# and waits a further 5s. --exec guards against killing an unrelated
# process that happens to have reused the recorded PID.
stop() {
    ebegin "Stopping ${name}"
    start-stop-daemon --stop \
        --pidfile "${pidfile}" \
        --retry "SIGTERM/30/SIGKILL/5" \
        --exec "${command}"
    eend $?
}
# ---------------------------------------------------------------
# reload() -- reload configuration without full restart
# Sends SIGHUP to the process, which most daemons handle
# as a config reload signal.
# ---------------------------------------------------------------
# reload() -- reload configuration without a full restart. Validates
# first so we never HUP the daemon into a broken config. OpenRC's
# start-stop-daemon with --signal (no --start/--stop) delivers the
# signal to the pidfile's process; SIGHUP is the conventional
# "re-read configuration" signal for daemons.
reload() {
    checkconfig || return 1
    ebegin "Reloading ${name} configuration"
    start-stop-daemon --signal HUP --pidfile "${pidfile}"
    eend $?
}
# ---------------------------------------------------------------
# status() -- check if the service is running
# ---------------------------------------------------------------
# status() -- check whether the service is running, LSB-style exit
# codes: 0 running, 1 stale/dead with pidfile, 3 not running.
# Fixes over the previous version: the pidfile is read once into a
# quoted variable (the old code expanded $(cat ...) unquoted, three
# separate times), and an empty/corrupt pidfile is handled explicitly
# instead of producing a bare "kill -0" and a "(PID: )" message.
status() {
    local pid
    if [ -f "${pidfile}" ]; then
        pid=$(cat "${pidfile}" 2>/dev/null)
        # Only probe the PID when the pidfile actually held one;
        # kill -0 tests existence without sending a signal.
        if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
            einfo "${name} is running (PID: ${pid})"
            return 0
        fi
        ewarn "${name} has a stale pidfile (PID: ${pid:-unknown})"
        return 1
    fi
    einfo "${name} is not running"
    return 3
}
# ===================================================================
# /etc/conf.d/myapp -- Configuration file
# ===================================================================
# Place this content in /etc/conf.d/myapp to override defaults:
#
# # User and group to run as
# MYAPP_USER="myapp"
# MYAPP_GROUP="myapp"
#
# # Configuration file path
# MYAPP_CONFIG="/etc/myapp/config.yaml"
#
# # Data and log directories
# MYAPP_DATADIR="/var/lib/myapp"
# MYAPP_LOGDIR="/var/log/myapp"
#
# # Listen address and port (use 10.42.0.x format)
# MYAPP_BIND="10.42.0.199"
# MYAPP_PORT="8080"
#
# # Additional command-line options
# MYAPP_EXTRA_OPTS="--verbose --max-connections 100"
# ===================================================================
# Equivalent systemd unit file (for reference/comparison)
# ===================================================================
# [Unit]
# Description=MyApp Application Server
# After=network-online.target postgresql.service
# Wants=network-online.target
# # Note: Requires= below is a HARD dependency -- stricter than the
# # OpenRC 'after postgresql' above, which is only soft ordering.
# # Use Wants=postgresql.service for a closer equivalent.
# Requires=postgresql.service
#
# [Service]
# Type=simple
# User=myapp
# Group=myapp
# ExecStartPre=/usr/local/bin/myapp --validate-config /etc/myapp/config.yaml
# ExecStart=/usr/local/bin/myapp --config /etc/myapp/config.yaml --bind 10.42.0.199:8080
# ExecReload=/bin/kill -HUP $MAINPID
# Restart=on-failure
# RestartSec=5
# StandardOutput=append:/var/log/myapp/myapp.log
# StandardError=append:/var/log/myapp/myapp.err
# LimitNOFILE=65536
#
# [Install]
# WantedBy=multi-user.target