Skip to main content
ansible docker-stack.yml

Docker Stack Deployment

Deploy and configure Docker with common containers across multiple hosts

yaml
---
# Ansible Playbook: Deploy Docker Stack
# Usage: ansible-playbook -i inventory docker-stack.yml

- name: Deploy Docker and common containers
  hosts: docker_hosts
  become: true
  vars:
    # NOTE(review): unused in this play -- compose ships via
    # docker-compose-plugin below; confirm before removing.
    docker_compose_version: "2.24.0"
    containers:
      - name: portainer
        image: portainer/portainer-ce:latest
        ports: ["9443:9443"]
        volumes: ["/var/run/docker.sock:/var/run/docker.sock", "portainer_data:/data"]
      - name: watchtower
        image: containrrr/watchtower:latest
        volumes: ["/var/run/docker.sock:/var/run/docker.sock"]
        environment:
          WATCHTOWER_CLEANUP: "true"
          WATCHTOWER_SCHEDULE: "0 0 4 * * *"

  tasks:
    - name: Install Docker dependencies
      ansible.builtin.apt:
        name:
          - apt-transport-https
          - ca-certificates
          - curl
          - gnupg
          - lsb-release
        state: present
        update_cache: true

    # NOTE(review): apt_key is deprecated (removed in newer ansible-core);
    # prefer downloading the key into /etc/apt/keyrings and referencing it
    # with signed-by in the repo line.
    - name: Add Docker GPG key
      ansible.builtin.apt_key:
        url: https://download.docker.com/linux/ubuntu/gpg
        state: present

    - name: Add Docker repository
      ansible.builtin.apt_repository:
        repo: "deb https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
        state: present

    - name: Install Docker
      ansible.builtin.apt:
        name:
          - docker-ce
          - docker-ce-cli
          - containerd.io
          - docker-buildx-plugin
          - docker-compose-plugin
        state: present

    - name: Start and enable Docker
      ansible.builtin.systemd:
        name: docker
        state: started
        enabled: true

    # Each entry in `containers` may omit ports/volumes/environment;
    # default(omit) drops the argument entirely in that case.
    - name: Deploy containers
      community.docker.docker_container:
        name: "{{ item.name }}"
        image: "{{ item.image }}"
        ports: "{{ item.ports | default(omit) }}"
        volumes: "{{ item.volumes | default(omit) }}"
        env: "{{ item.environment | default(omit) }}"
        restart_policy: unless-stopped
      loop: "{{ containers }}"
      loop_control:
        label: "{{ item.name }}"
ansible tailscale-setup.yml

Tailscale Mesh VPN Setup

Install and configure Tailscale across all nodes with subnet routing

yaml
---
# Ansible Playbook: Tailscale Mesh VPN
# Usage: ansible-playbook -i inventory tailscale-setup.yml -e "tailscale_authkey=tskey-xxx"

- name: Install and configure Tailscale
  hosts: all
  become: true
  vars:
    tailscale_authkey: "{{ lookup('env', 'TAILSCALE_AUTHKEY') }}"
    subnet_routers:
      - host: alpha-centauri
        advertise_routes: "10.42.0.0/24"
      - host: titawin-host
        advertise_routes: "192.168.20.0/24"

  tasks:
    # NOTE(review): "jammy" is hardcoded here and in the repo line; use
    # {{ ansible_distribution_release }} if the fleet mixes Ubuntu releases.
    - name: Add Tailscale repository key
      ansible.builtin.apt_key:
        url: https://pkgs.tailscale.com/stable/ubuntu/jammy.noarmor.gpg
        state: present

    - name: Add Tailscale repository
      ansible.builtin.apt_repository:
        repo: "deb https://pkgs.tailscale.com/stable/ubuntu jammy main"
        state: present

    - name: Install Tailscale
      ansible.builtin.apt:
        name: tailscale
        state: present
        update_cache: true

    - name: Enable IP forwarding (for subnet routers)
      ansible.posix.sysctl:
        name: "{{ item }}"
        value: "1"
        sysctl_set: true
        reload: true
      loop:
        - net.ipv4.ip_forward
        - net.ipv6.conf.all.forwarding
      when: inventory_hostname in (subnet_routers | map(attribute='host'))

    - name: Start Tailscale service
      ansible.builtin.systemd:
        name: tailscaled
        state: started
        enabled: true

    # no_log keeps the auth key out of task output and any log aggregation.
    - name: Authenticate with Tailscale
      ansible.builtin.command: >
        tailscale up
        --authkey={{ tailscale_authkey }}
        --ssh
        --accept-routes
        {% if inventory_hostname in (subnet_routers | map(attribute='host')) %}
        --advertise-routes={{ (subnet_routers | selectattr('host', 'eq', inventory_hostname) | first).advertise_routes }}
        {% endif %}
      register: tailscale_up
      changed_when: "'Success' in tailscale_up.stdout"
      no_log: true

    # `tailscale status` prints the whole peer table; `tailscale ip -4`
    # returns only this node's Tailscale IPv4 address, which is what the
    # debug task below actually claims to show.
    - name: Get Tailscale IPv4 address
      ansible.builtin.command: tailscale ip -4
      register: ts_ip
      changed_when: false

    - name: Display Tailscale IP
      ansible.builtin.debug:
        msg: "{{ inventory_hostname }} Tailscale IP: {{ ts_ip.stdout_lines[0] }}"
ansible monitoring-stack.yml

Prometheus + Node Exporter + Alertmanager

Full monitoring stack with Prometheus, Node Exporter, and Alertmanager using version-managed downloads, dedicated service users, and systemd units with proper handlers

yaml
---
# Ansible Playbook: Prometheus Monitoring Stack
# Deploys Prometheus, Node Exporter, and Alertmanager to monitoring hosts
# Usage: ansible-playbook -i inventory monitoring-stack.yml
# Selective: ansible-playbook -i inventory monitoring-stack.yml --tags prometheus
#            ansible-playbook -i inventory monitoring-stack.yml --tags node_exporter
#            ansible-playbook -i inventory monitoring-stack.yml --tags alertmanager

- name: Deploy Prometheus monitoring stack
  hosts: monitoring_servers
  become: true
  vars:
    # -----------------------------------------------------------------
    # Version variables: update these instead of hardcoding URLs
    # -----------------------------------------------------------------
    prometheus_version: "2.51.2"
    node_exporter_version: "1.8.1"
    alertmanager_version: "0.27.0"

    # Architecture (amd64, arm64)
    arch: "amd64"

    # Paths
    prometheus_config_dir: /etc/prometheus
    prometheus_data_dir: /var/lib/prometheus
    alertmanager_config_dir: /etc/alertmanager
    alertmanager_data_dir: /var/lib/alertmanager

    # Network -- all IPs use 10.42.0.x format
    prometheus_listen: "10.42.0.30:9090"
    alertmanager_listen: "10.42.0.30:9093"
    node_exporter_listen: "0.0.0.0:9100"

    # Scrape targets -- add hosts here
    scrape_targets:
      - "10.42.0.30:9100"   # monitoring server itself
      - "10.42.0.201:9100"  # izar-host
      - "10.42.0.199:9100"  # altair-link
      - "10.42.0.175:9100"  # tau-host
      - "10.42.0.10:9100"   # capella-outpost

    # Alertmanager Slack webhook (store in vault for production)
    slack_webhook_url: "https://hooks.slack.com/services/REPLACE/WITH/REAL"
    slack_channel: "#homelab-alerts"

    # Retention
    prometheus_retention: "30d"
    prometheus_retention_size: "10GB"

  # =================================================================
  # HANDLERS -- restart services only when config changes
  # =================================================================
  handlers:
    - name: Restart prometheus
      ansible.builtin.systemd:
        name: prometheus
        state: restarted
        daemon_reload: true
      tags: [prometheus]

    - name: Restart node_exporter
      ansible.builtin.systemd:
        name: node_exporter
        state: restarted
        daemon_reload: true
      tags: [node_exporter]

    - name: Restart alertmanager
      ansible.builtin.systemd:
        name: alertmanager
        state: restarted
        daemon_reload: true
      tags: [alertmanager]

    # Lighter-weight than "Restart prometheus": systemd `reloaded` runs
    # the unit's ExecReload, used when only prometheus.yml changed.
    - name: Reload prometheus config
      ansible.builtin.systemd:
        name: prometheus
        state: reloaded
      tags: [prometheus]
  tasks:
    # ===============================================================
    # PROMETHEUS
    # ===============================================================
    - name: Create prometheus system user
      ansible.builtin.user:
        name: prometheus
        system: true
        shell: /usr/sbin/nologin
        create_home: false
      tags: [prometheus]

    - name: Create Prometheus directories
      ansible.builtin.file:
        path: "{{ item }}"
        state: directory
        owner: prometheus
        group: prometheus
        mode: "0755"
      loop:
        - "{{ prometheus_config_dir }}"
        - "{{ prometheus_config_dir }}/rules"
        - "{{ prometheus_data_dir }}"
      tags: [prometheus]

    # get_url verifies the tarball against the published sha256sums.txt:
    # a checksum URL is fetched and the entry matching the destination
    # filename is used.
    - name: Download Prometheus (version-managed)
      ansible.builtin.get_url:
        url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ arch }}.tar.gz"
        dest: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
        checksum: "sha256:https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/sha256sums.txt"
      tags: [prometheus]

    - name: Extract Prometheus binaries
      ansible.builtin.unarchive:
        src: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
        dest: /tmp
        remote_src: true
        creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ arch }}/prometheus"
      tags: [prometheus]

    - name: Install Prometheus binaries
      ansible.builtin.copy:
        src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ arch }}/{{ item }}"
        dest: "/usr/local/bin/{{ item }}"
        remote_src: true
        owner: root
        group: root
        mode: "0755"
      loop:
        - prometheus
        - promtool
      notify: Restart prometheus
      tags: [prometheus]

    # validate: promtool rejects a broken config before it is installed.
    - name: Deploy Prometheus configuration
      ansible.builtin.template:
        src: prometheus.yml.j2
        dest: "{{ prometheus_config_dir }}/prometheus.yml"
        owner: prometheus
        group: prometheus
        mode: "0644"
        validate: "/usr/local/bin/promtool check config %s"
      notify: Reload prometheus config
      tags: [prometheus]
      # --- Template content (prometheus.yml.j2) ---
      # global:
      #   scrape_interval: 15s
      #   evaluation_interval: 15s
      # alerting:
      #   alertmanagers:
      #     - static_configs:
      #         - targets: ['{{ alertmanager_listen }}']
      # rule_files:
      #   - "rules/*.yml"
      # scrape_configs:
      #   - job_name: 'node'
      #     static_configs:
      #       - targets: {{ scrape_targets | to_json }}

    - name: Deploy Prometheus systemd unit
      # Note: for OpenRC hosts, deploy an /etc/init.d/prometheus script
      # and /etc/conf.d/prometheus instead of this systemd unit
      ansible.builtin.template:
        src: prometheus.service.j2
        dest: /etc/systemd/system/prometheus.service
        mode: "0644"
      notify: Restart prometheus
      tags: [prometheus]
      # --- Unit file content ---
      # [Unit]
      # Description=Prometheus Monitoring
      # After=network-online.target
      # Wants=network-online.target
      # [Service]
      # User=prometheus
      # Group=prometheus
      # Type=simple
      # ExecStart=/usr/local/bin/prometheus \
      #   --config.file={{ prometheus_config_dir }}/prometheus.yml \
      #   --storage.tsdb.path={{ prometheus_data_dir }} \
      #   --storage.tsdb.retention.time={{ prometheus_retention }} \
      #   --storage.tsdb.retention.size={{ prometheus_retention_size }} \
      #   --web.listen-address={{ prometheus_listen }} \
      #   --web.enable-lifecycle
      # ExecReload=/bin/kill -HUP $MAINPID
      # Restart=on-failure
      # RestartSec=5
      # LimitNOFILE=65536
      # [Install]
      # WantedBy=multi-user.target

    - name: Enable and start Prometheus
      ansible.builtin.systemd:
        name: prometheus
        state: started
        enabled: true
      tags: [prometheus]

    # ===============================================================
    # NODE EXPORTER
    # ===============================================================
    - name: Create node_exporter system user
      ansible.builtin.user:
        name: node_exporter
        system: true
        shell: /usr/sbin/nologin
        create_home: false
      tags: [node_exporter]

    - name: Download Node Exporter (version-managed)
      ansible.builtin.get_url:
        url: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-{{ arch }}.tar.gz"
        dest: "/tmp/node_exporter-{{ node_exporter_version }}.tar.gz"
        checksum: "sha256:https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/sha256sums.txt"
      tags: [node_exporter]

    - name: Extract Node Exporter
      ansible.builtin.unarchive:
        src: "/tmp/node_exporter-{{ node_exporter_version }}.tar.gz"
        dest: /tmp
        remote_src: true
        creates: "/tmp/node_exporter-{{ node_exporter_version }}.linux-{{ arch }}/node_exporter"
      tags: [node_exporter]

    - name: Install Node Exporter binary
      ansible.builtin.copy:
        src: "/tmp/node_exporter-{{ node_exporter_version }}.linux-{{ arch }}/node_exporter"
        dest: /usr/local/bin/node_exporter
        remote_src: true
        owner: root
        group: root
        mode: "0755"
      notify: Restart node_exporter
      tags: [node_exporter]

    - name: Deploy Node Exporter systemd unit
      # Note: for OpenRC, create /etc/init.d/node_exporter with
      # start-stop-daemon --start --user node_exporter --exec /usr/local/bin/node_exporter
      # NOTE(review): --collector.systemd typically needs access to the
      # systemd D-Bus API; confirm it works when running as the
      # unprivileged node_exporter user, or drop the flag.
      ansible.builtin.copy:
        dest: /etc/systemd/system/node_exporter.service
        mode: "0644"
        content: |
          [Unit]
          Description=Prometheus Node Exporter
          After=network-online.target
          Wants=network-online.target

          [Service]
          User=node_exporter
          Group=node_exporter
          Type=simple
          ExecStart=/usr/local/bin/node_exporter \
            --web.listen-address={{ node_exporter_listen }} \
            --collector.systemd \
            --collector.processes \
            --collector.filesystem.mount-points-exclude="^/(sys|proc|dev|host|etc)($$|/)" \
            --collector.netclass.ignored-devices="^(veth|docker|br-).*"
          Restart=on-failure
          RestartSec=5

          [Install]
          WantedBy=multi-user.target
      notify: Restart node_exporter
      tags: [node_exporter]

    - name: Enable and start Node Exporter
      ansible.builtin.systemd:
        name: node_exporter
        state: started
        enabled: true
      tags: [node_exporter]

    # ===============================================================
    # ALERTMANAGER
    # ===============================================================
    - name: Create alertmanager system user
      ansible.builtin.user:
        name: alertmanager
        system: true
        shell: /usr/sbin/nologin
        create_home: false
      tags: [alertmanager]

    - name: Create Alertmanager directories
      ansible.builtin.file:
        path: "{{ item }}"
        state: directory
        owner: alertmanager
        group: alertmanager
        mode: "0755"
      loop:
        - "{{ alertmanager_config_dir }}"
        - "{{ alertmanager_data_dir }}"
      tags: [alertmanager]

    - name: Download Alertmanager (version-managed)
      ansible.builtin.get_url:
        url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-{{ arch }}.tar.gz"
        dest: "/tmp/alertmanager-{{ alertmanager_version }}.tar.gz"
        checksum: "sha256:https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/sha256sums.txt"
      tags: [alertmanager]

    - name: Extract Alertmanager
      ansible.builtin.unarchive:
        src: "/tmp/alertmanager-{{ alertmanager_version }}.tar.gz"
        dest: /tmp
        remote_src: true
        creates: "/tmp/alertmanager-{{ alertmanager_version }}.linux-{{ arch }}/alertmanager"
      tags: [alertmanager]

    - name: Install Alertmanager binaries
      ansible.builtin.copy:
        src: "/tmp/alertmanager-{{ alertmanager_version }}.linux-{{ arch }}/{{ item }}"
        dest: "/usr/local/bin/{{ item }}"
        remote_src: true
        owner: root
        group: root
        mode: "0755"
      loop:
        - alertmanager
        - amtool
      notify: Restart alertmanager
      tags: [alertmanager]

    # validate: amtool rejects a broken config before it is installed.
    - name: Deploy Alertmanager configuration
      ansible.builtin.template:
        src: alertmanager.yml.j2
        dest: "{{ alertmanager_config_dir }}/alertmanager.yml"
        owner: alertmanager
        group: alertmanager
        mode: "0644"
        validate: "/usr/local/bin/amtool check-config %s"
      notify: Restart alertmanager
      tags: [alertmanager]
      # --- Template content (alertmanager.yml.j2) ---
      # global:
      #   resolve_timeout: 5m
      # route:
      #   group_by: ['alertname', 'job']
      #   group_wait: 30s
      #   group_interval: 5m
      #   repeat_interval: 4h
      #   receiver: 'slack-notifications'
      # receivers:
      #   - name: 'slack-notifications'
      #     slack_configs:
      #       - api_url: '{{ slack_webhook_url }}'
      #         channel: '{{ slack_channel }}'
      #         send_resolved: true
      #         title: '[{{ "{{" }} .Status | toUpper {{ "}}" }}] {{ "{{" }} .CommonLabels.alertname {{ "}}" }}'
      #         text: '{{ "{{" }} range .Alerts {{ "}}" }}*{{ "{{" }} .Annotations.summary {{ "}}" }}*{{ "{{" }} end {{ "}}" }}'

    - name: Deploy Alertmanager systemd unit
      ansible.builtin.copy:
        dest: /etc/systemd/system/alertmanager.service
        mode: "0644"
        content: |
          [Unit]
          Description=Prometheus Alertmanager
          After=network-online.target
          Wants=network-online.target

          [Service]
          User=alertmanager
          Group=alertmanager
          Type=simple
          ExecStart=/usr/local/bin/alertmanager \
            --config.file={{ alertmanager_config_dir }}/alertmanager.yml \
            --storage.path={{ alertmanager_data_dir }} \
            --web.listen-address={{ alertmanager_listen }}
          ExecReload=/bin/kill -HUP $MAINPID
          Restart=on-failure
          RestartSec=5

          [Install]
          WantedBy=multi-user.target
      notify: Restart alertmanager
      tags: [alertmanager]

    - name: Enable and start Alertmanager
      ansible.builtin.systemd:
        name: alertmanager
        state: started
        enabled: true
      tags: [alertmanager]
terraform proxmox-vm/main.tf

Proxmox VM Provisioning

Terraform module to create VMs on Proxmox with cloud-init

hcl
# Terraform: Proxmox VM Module
# Creates VMs with cloud-init configuration

terraform {
  required_providers {
    proxmox = {
      source  = "Telmate/proxmox"
      version = "~> 2.9"
    }
  }
}

# NOTE(review): declared but not referenced in this file -- presumably
# consumed by a provider block in a sibling file; confirm before removing.
variable "proxmox_host" {
  description = "Proxmox host IP"
  type        = string
  default     = "10.42.0.201"
}

variable "vm_name" {
  description = "Name of the VM"
  type        = string
}

variable "target_node" {
  description = "Proxmox node to deploy on"
  type        = string
  default     = "icarus"
}

variable "cores" {
  description = "Number of CPU cores"
  type        = number
  default     = 4
}

variable "memory" {
  description = "RAM in MB"
  type        = number
  default     = 4096
}

variable "disk_size" {
  description = "Boot disk size"
  type        = string
  default     = "32G"
}

variable "ip_address" {
  description = "Static IP address"
  type        = string
}

variable "gateway" {
  description = "Network gateway"
  type        = string
  default     = "10.42.0.1"
}

variable "ssh_keys" {
  description = "SSH public keys for cloud-init"
  type        = string
}

resource "proxmox_vm_qemu" "vm" {
  name        = var.vm_name
  target_node = var.target_node
  clone       = "ubuntu-cloud-template" # template must already exist on the node

  cores   = var.cores
  sockets = 1
  memory  = var.memory

  agent = 1 # Enable QEMU guest agent

  disk {
    storage = "local-zfs"
    size    = var.disk_size
    type    = "scsi"
  }

  network {
    model  = "virtio"
    bridge = "vmbr0"
  }

  # Cloud-init configuration.
  # NOTE(review): the /24 prefix is hardcoded; parameterize it if VMs
  # land on differently sized subnets.
  os_type   = "cloud-init"
  ipconfig0 = "ip=${var.ip_address}/24,gw=${var.gateway}"
  ciuser    = "commander"
  sshkeys   = var.ssh_keys

  # Ignore NIC drift after clone to avoid perpetual plan diffs.
  lifecycle {
    ignore_changes = [
      network,
    ]
  }

  tags = "terraform,${var.vm_name}"
}

output "vm_ip" {
  value = var.ip_address
}

output "vm_id" {
  value = proxmox_vm_qemu.vm.vmid
}
terraform cloudflare-tunnel/main.tf

Cloudflare Tunnel Configuration

Terraform module for Cloudflare Tunnel and DNS records

hcl
# Terraform: Cloudflare Tunnel
# Manages tunnel configuration and DNS records

terraform {
  required_providers {
    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4.0"
    }
  }
}

variable "cloudflare_account_id" {
  description = "Cloudflare account ID"
  type        = string
  sensitive   = true
}

variable "cloudflare_zone_id" {
  description = "Cloudflare zone ID for your domain"
  type        = string
}

variable "domain" {
  description = "Base domain name"
  default     = "argobox.com"
}

variable "tunnel_secret" {
  description = "Tunnel secret (base64)"
  type        = string
  sensitive   = true
}

# One ingress rule + one CNAME record is created per entry.
variable "services" {
  description = "Services to expose through tunnel"
  type = list(object({
    subdomain = string
    service   = string
    port      = number
  }))
  default = [
    { subdomain = "git", service = "localhost", port = 3000 },
    { subdomain = "ai", service = "localhost", port = 30000 },
    { subdomain = "vault", service = "localhost", port = 31745 },
  ]
}

# Create the tunnel
# NOTE(review): cloudflare_tunnel is the v4-era resource name; newer
# provider majors rename it (zero_trust_*) -- verify before upgrading
# past ~> 4.0.
resource "cloudflare_tunnel" "homelab" {
  account_id = var.cloudflare_account_id
  name       = "homelab-tunnel"
  secret     = var.tunnel_secret
}

# Configure tunnel routes -- one HTTP ingress rule per service entry,
# in list order, followed by the mandatory catch-all.
resource "cloudflare_tunnel_config" "homelab" {
  account_id = var.cloudflare_account_id
  tunnel_id  = cloudflare_tunnel.homelab.id

  config {
    dynamic "ingress_rule" {
      for_each = var.services
      content {
        hostname = "${ingress_rule.value.subdomain}.${var.domain}"
        service  = "http://${ingress_rule.value.service}:${ingress_rule.value.port}"
      }
    }

    # Catch-all rule (required)
    ingress_rule {
      service = "http_status:404"
    }
  }
}

# Create DNS records pointing to tunnel
# Each subdomain gets a proxied CNAME to <tunnel-id>.cfargotunnel.com.
resource "cloudflare_record" "tunnel_dns" {
  for_each = { for s in var.services : s.subdomain => s }

  zone_id = var.cloudflare_zone_id
  name    = each.value.subdomain
  value   = "${cloudflare_tunnel.homelab.id}.cfargotunnel.com"
  type    = "CNAME"
  proxied = true
}

output "tunnel_id" {
  value = cloudflare_tunnel.homelab.id
}

# Token consumed by the cloudflared connector on the host side.
output "tunnel_token" {
  value     = cloudflare_tunnel.homelab.tunnel_token
  sensitive = true
}
shell backup.sh

Restic + Rsync Hybrid Backup

Versioned backups with Restic to SFTP/S3 remote targets, fast local snapshots via rsync, pre/post hooks for containers and databases, retention policy enforcement, and integrity checking

bash
#!/bin/bash
# ===================================================================
# Restic + Rsync Hybrid Backup Script
# -------------------------------------------------------------------
# Restic: versioned, deduplicated backups to a remote SFTP or S3 repo
# Rsync:  fast local snapshots for quick restores
#
# Cron example (daily at 2 AM):
#   0 2 * * * /usr/local/bin/backup.sh >> /var/log/backup.log 2>&1
# ===================================================================

# -e added: without it, a failed restic/rsync/docker command fell
# through to the end of the script, which then set BACKUP_STATUS=OK and
# sent a success notification. With -e, any failure exits and the EXIT
# trap reports FAILED (and still restarts stopped containers).
set -Eeuo pipefail

# ---------------------------------------------------------------
# Lock file -- prevent concurrent runs
# ---------------------------------------------------------------
LOCKFILE="/var/run/backup-homelab.lock"
exec 200>"${LOCKFILE}"
if ! flock -n 200; then
    echo "[ERROR] Another backup is already running (lockfile: ${LOCKFILE}). Exiting."
    exit 1
fi

# ---------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------
HOSTNAME_SHORT=$(hostname -s)
LOG_FILE="/var/log/backup.log"
NTFY_URL="https://ntfy.sh/argobox-backups"

# Restic repository -- SFTP target on NAS at 10.42.0.50
# For S3: export RESTIC_REPOSITORY="s3:https://s3.example.com/backups"
export RESTIC_REPOSITORY="sftp:[email protected]:/volume1/backups/restic/${HOSTNAME_SHORT}"
export RESTIC_PASSWORD_FILE="/etc/backup/restic-password"

# Rsync local snapshot destination
LOCAL_SNAPSHOT_DIR="/mnt/backups/${HOSTNAME_SHORT}/latest"

# Directories to back up (use arrays -- no eval needed)
BACKUP_PATHS=(
    "/etc"
    "/home/commander"
    "/opt/docker"
    "/opt/stacks"
    "/var/lib/docker/volumes"
)

# Paths to exclude
EXCLUDE_PATTERNS=(
    "*.tmp"
    "*.cache"
    "**/node_modules"
    "**/.git"
    "**/__pycache__"
    "**/*.log"
    "**/lost+found"
)

# Containers to stop before backup (space-separated)
STOP_CONTAINERS=("postgres" "mariadb")

# Databases to dump
POSTGRES_CONTAINER="postgres"
POSTGRES_DATABASES=("gitea" "nextcloud")
DB_DUMP_DIR="/tmp/backup-db-dumps"

# Retention policy
KEEP_DAILY=7
KEEP_WEEKLY=4
KEEP_MONTHLY=6
KEEP_YEARLY=1
# ---------------------------------------------------------------
# Logging helper
# ---------------------------------------------------------------
log() {
    # Usage: log LEVEL MESSAGE...
    # Writes a timestamped line to stdout and appends it to LOG_FILE.
    local severity="$1"
    shift
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    echo "[${stamp}] [${severity}] $*" | tee -a "${LOG_FILE}"
}

# ---------------------------------------------------------------
# Notification helper -- send status to ntfy on completion
# ---------------------------------------------------------------
notify() {
    # Usage: notify TITLE MESSAGE [PRIORITY]
    # Best-effort push to NTFY_URL; never aborts the backup on failure.
    local title="$1"
    local message="$2"
    local priority="${3:-default}"
    local -a curl_args=(
        -s -o /dev/null
        -H "Title: ${title}"
        -H "Priority: ${priority}"
        -d "${message}"
    )
    curl "${curl_args[@]}" "${NTFY_URL}" || true
}

# ---------------------------------------------------------------
# Error handler
# ---------------------------------------------------------------
# BACKUP_STATUS is flipped to "OK" as the script's final step; if the
# EXIT trap fires while it is still "FAILED", the run aborted early.
BACKUP_STATUS="FAILED"
cleanup() {
    # $? must be read first, before any other command overwrites it.
    local exit_code=$?
    # Always restart containers, even on failure
    post_backup_hooks || true
    if [[ "${BACKUP_STATUS}" != "OK" ]]; then
        log "ERROR" "Backup failed with exit code ${exit_code}"
        notify "Backup FAILED [${HOSTNAME_SHORT}]" \
            "Backup failed at $(date). Check ${LOG_FILE} for details." \
            "urgent"
    fi
    # Remove plaintext DB dumps from /tmp regardless of outcome.
    rm -rf "${DB_DUMP_DIR}" 2>/dev/null || true
}
trap cleanup EXIT

# ---------------------------------------------------------------
# Pre-backup hooks: dump databases, then stop containers
# ---------------------------------------------------------------
pre_backup_hooks() {
    log "INFO" "Running pre-backup hooks"

    # Dump PostgreSQL databases FIRST: `docker exec` requires a running
    # container, and ${POSTGRES_CONTAINER} is also listed in
    # STOP_CONTAINERS -- the previous order (stop, then dump) made every
    # dump fail silently via the `|| log WARN` fallback.
    mkdir -p "${DB_DUMP_DIR}"
    if docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then
        for db in "${POSTGRES_DATABASES[@]}"; do
            log "INFO" "Dumping PostgreSQL database: ${db}"
            docker exec "${POSTGRES_CONTAINER}" \
                pg_dump -U postgres -Fc "${db}" > "${DB_DUMP_DIR}/${db}.dump" \
                || log "WARN" "Failed to dump database: ${db}"
        done
    fi

    # Stop listed containers gracefully (30 s grace before SIGKILL)
    for ctr in "${STOP_CONTAINERS[@]}"; do
        if docker ps --format '{{.Names}}' | grep -q "^${ctr}$"; then
            log "INFO" "Stopping container: ${ctr}"
            docker stop "${ctr}" --time 30
        fi
    done
}

# ---------------------------------------------------------------
# Post-backup hooks: restart containers, verify
# ---------------------------------------------------------------
post_backup_hooks() {
    # Restart every container listed in STOP_CONTAINERS that exists
    # on this host (running or stopped -- `docker start` on a running
    # container is a no-op).
    log "INFO" "Running post-backup hooks"
    local name
    for name in "${STOP_CONTAINERS[@]}"; do
        docker ps -a --format '{{.Names}}' | grep -q "^${name}$" || continue
        log "INFO" "Starting container: ${name}"
        docker start "${name}"
    done
}

# ---------------------------------------------------------------
# Build exclude arguments for restic and rsync
# ---------------------------------------------------------------
RESTIC_EXCLUDES=()
RSYNC_EXCLUDES=()
for pattern in "${EXCLUDE_PATTERNS[@]}"; do
    RESTIC_EXCLUDES+=("--exclude" "${pattern}")
    RSYNC_EXCLUDES+=("--exclude" "${pattern}")
done

# ---------------------------------------------------------------
# Main backup sequence
# ---------------------------------------------------------------
log "INFO" "====== Backup started for ${HOSTNAME_SHORT} ======"

# 1. Pre-backup hooks (stop containers, dump DBs)
pre_backup_hooks

# 2. Include database dumps in backup paths
if [[ -d "${DB_DUMP_DIR}" ]] && ls "${DB_DUMP_DIR}"/*.dump &>/dev/null; then
    BACKUP_PATHS+=("${DB_DUMP_DIR}")
fi

# 3. Initialize restic repo if it does not exist
if ! restic snapshots --quiet &>/dev/null; then
    log "INFO" "Initializing new restic repository"
    restic init
fi

# 4. Restic backup -- versioned, deduplicated
log "INFO" "Starting restic backup to ${RESTIC_REPOSITORY}"
restic backup \
    "${RESTIC_EXCLUDES[@]}" \
    --tag "${HOSTNAME_SHORT}" \
    --tag "scheduled" \
    --verbose \
    "${BACKUP_PATHS[@]}" 2>&1 | tee -a "${LOG_FILE}"
# Check restic's own exit status (not tee's): otherwise a failed backup
# falls through and is reported as OK at the end of the script.
if [[ "${PIPESTATUS[0]}" -ne 0 ]]; then
    log "ERROR" "restic backup failed"
    exit 1
fi

# 5. Rsync local snapshot -- fast local copy for quick restores
log "INFO" "Rsync local snapshot to ${LOCAL_SNAPSHOT_DIR}"
mkdir -p "${LOCAL_SNAPSHOT_DIR}"
for src in "${BACKUP_PATHS[@]}"; do
    if [[ -d "${src}" ]]; then
        dest_subdir="${LOCAL_SNAPSHOT_DIR}${src}"
        mkdir -p "${dest_subdir}"
        rsync -a --delete "${RSYNC_EXCLUDES[@]}" "${src}/" "${dest_subdir}/" \
            2>&1 | tee -a "${LOG_FILE}"
        # Local snapshot is best-effort: warn but keep going.
        if [[ "${PIPESTATUS[0]}" -ne 0 ]]; then
            log "WARN" "rsync reported errors for ${src}"
        fi
    else
        log "WARN" "Source path does not exist, skipping: ${src}"
    fi
done

# 6. Post-backup hooks (restart containers)
post_backup_hooks

# 7. Enforce retention policy
log "INFO" "Enforcing retention policy"
restic forget \
    --keep-daily "${KEEP_DAILY}" \
    --keep-weekly "${KEEP_WEEKLY}" \
    --keep-monthly "${KEEP_MONTHLY}" \
    --keep-yearly "${KEEP_YEARLY}" \
    --prune \
    --tag "${HOSTNAME_SHORT}" 2>&1 | tee -a "${LOG_FILE}"

# 8. Integrity check (run weekly -- date +%u is 7 on Sunday)
DOW=$(date +%u)
if [[ "${DOW}" -eq 7 ]]; then
    log "INFO" "Running weekly integrity check"
    restic check --read-data-subset=5% 2>&1 | tee -a "${LOG_FILE}"
fi

# 9. Report
SNAPSHOT_COUNT=$(restic snapshots --tag "${HOSTNAME_SHORT}" --json | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "?")
LOCAL_SIZE=$(du -sh "${LOCAL_SNAPSHOT_DIR}" 2>/dev/null | cut -f1 || echo "?")

log "INFO" "Backup complete. Restic snapshots: ${SNAPSHOT_COUNT}. Local snapshot size: ${LOCAL_SIZE}"

BACKUP_STATUS="OK"
notify "Backup OK [${HOSTNAME_SHORT}]" \
    "Restic snapshots: ${SNAPSHOT_COUNT}. Local: ${LOCAL_SIZE}. Duration: ${SECONDS}s."

log "INFO" "====== Backup finished in ${SECONDS}s ======"
shell docker-maintenance.sh

Docker Maintenance Script

Comprehensive Docker maintenance with selective image cleanup (7-day retention), safe volume pruning, builder cache limits, container health checks, log rotation verification, and ntfy notifications

bash
#!/bin/bash
# ===================================================================
# Docker Maintenance Script
# -------------------------------------------------------------------
# Performs selective cleanup of Docker resources with safety checks.
# Designed to run weekly via cron or systemd timer.
#
# Cron: 0 3 * * 0 /usr/local/bin/docker-maintenance.sh
# ===================================================================

# Deliberately NOT -e: the ERR trap below counts failures and the
# script continues through all phases; -E makes that trap fire inside
# functions as well.
set -Euo pipefail

# ---------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------
LOG_FILE="/var/log/docker-maintenance.log"
NTFY_URL="https://ntfy.sh/argobox-docker"
IMAGE_RETAIN_DAYS=7           # Keep images used within this window
BUILDER_CACHE_LIMIT="5GB"     # Max builder cache to retain
LOGROTATE_CONF="/etc/logrotate.d/docker-containers"

# ---------------------------------------------------------------
# Logging
# ---------------------------------------------------------------
ERRORS=0
log() {
    # log LEVEL MESSAGE... -> timestamped line to stdout and LOG_FILE
    local level="$1"
    shift
    printf '[%s] [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${level}" "$*" \
        | tee -a "${LOG_FILE}"
}

# ---------------------------------------------------------------
# Error handling -- count errors but continue
# ---------------------------------------------------------------
# The ERR trap fires on each failing simple command (inherited by
# functions thanks to set -E); the script keeps running and the total
# can be reported at the end.
on_error() {
    ERRORS=$((ERRORS + 1))
    # BASH_LINENO[0] is the line of the command that triggered the trap.
    log "ERROR" "Command failed on line ${BASH_LINENO[0]}"
}
trap on_error ERR

# ---------------------------------------------------------------
# Notification
# ---------------------------------------------------------------
notify() {
    # Push a message to the ntfy topic. Failures are swallowed (|| true)
    # so a dead notification endpoint can never break a maintenance run.
    local title="$1"
    local body="$2"
    local priority="${3:-default}"
    curl -s -o /dev/null -H "Title: ${title}" -H "Priority: ${priority}" -d "${body}" "${NTFY_URL}" || true
}

# ===================================================================
# Phase 1: Pre-cleanup disk usage report
# ===================================================================
log "INFO" "====== Docker Maintenance Started ======"
log "INFO" "--- Disk usage BEFORE cleanup ---"
docker system df -v 2>&1 | tee -a "${LOG_FILE}"
# First formatted row only -- used for the before/after comparison in
# the Phase 10 summary (presumably the Images row; verify on target).
DISK_BEFORE=$(docker system df --format '{{.Size}}' | head -1)

# ===================================================================
# Phase 2: Container health check
# ===================================================================
# Report-only: unhealthy containers are logged, never restarted here.
log "INFO" "--- Container health check ---"
UNHEALTHY=$(docker ps --filter "health=unhealthy" --format '{{.Names}} ({{.Image}}) status={{.Status}}' 2>/dev/null || true)
if [[ -n "${UNHEALTHY}" ]]; then
    log "WARN" "Unhealthy containers detected:"
    while IFS= read -r line; do
        log "WARN" "  ${line}"
    done <<< "${UNHEALTHY}"
else
    log "INFO" "All running containers are healthy"
fi

# Report containers in restart loops (restarting status)
RESTARTING=$(docker ps --filter "status=restarting" --format '{{.Names}}' 2>/dev/null || true)
if [[ -n "${RESTARTING}" ]]; then
    log "WARN" "Containers in restart loop: ${RESTARTING}"
fi

# ===================================================================
# Phase 3: Remove exited and dead containers
# ===================================================================
log "INFO" "--- Removing exited/dead containers ---"
# Repeated --filter status flags are OR'ed by docker ps.
DEAD_CONTAINERS=$(docker ps -aq --filter "status=exited" --filter "status=dead" 2>/dev/null || true)
if [[ -n "${DEAD_CONTAINERS}" ]]; then
    # IDs are whitespace-separated tokens, so wc -w counts them.
    DEAD_COUNT=$(echo "${DEAD_CONTAINERS}" | wc -w)
    log "INFO" "Removing ${DEAD_COUNT} exited/dead containers"
    echo "${DEAD_CONTAINERS}" | xargs docker rm 2>&1 | tee -a "${LOG_FILE}"
else
    log "INFO" "No exited/dead containers to remove"
fi

# ===================================================================
# Phase 4: Selective image cleanup
# Keep images used by running containers and images pulled/used
# within the last IMAGE_RETAIN_DAYS days. Remove the rest.
# ===================================================================
log "INFO" "--- Selective image cleanup (retain ${IMAGE_RETAIN_DAYS}-day window) ---"

# Collect images used by running containers (never remove these)
RUNNING_IMAGES=$(docker ps --format '{{.Image}}' | sort -u)

# Collect all image IDs
ALL_IMAGES=$(docker images --format '{{.ID}}|{{.Repository}}:{{.Tag}}|{{.CreatedAt}}' 2>/dev/null || true)

REMOVED_IMAGES=0
# Cutoff timestamp: GNU date syntax first, BSD date as fallback.
CUTOFF_TS=$(date -d "-${IMAGE_RETAIN_DAYS} days" +%s 2>/dev/null || date -v-${IMAGE_RETAIN_DAYS}d +%s)

while IFS='|' read -r img_id img_name img_created; do
    [[ -z "${img_id}" ]] && continue

    # Skip images used by running containers.
    # FIX: -F matches the image name as a literal string. The previous
    # unanchored regex match let dots in repo/tag names (e.g. "app:1.2.3")
    # match any character, producing false "in use" hits.
    if echo "${RUNNING_IMAGES}" | grep -qF -- "${img_name}"; then
        continue
    fi

    # Parse the creation date.
    # FIX: if the date cannot be parsed, SKIP the image. The previous
    # fallback of "0" made unparseable dates look infinitely old, which
    # deleted images that should have been kept.
    img_ts=$(date -d "${img_created}" +%s 2>/dev/null || echo "")
    if [[ -z "${img_ts}" ]]; then
        log "WARN" "Cannot parse creation date for ${img_name} ('${img_created}') -- skipping"
        continue
    fi
    if [[ "${img_ts}" -lt "${CUTOFF_TS}" ]]; then
        log "INFO" "Removing old image: ${img_name} (created: ${img_created})"
        # rmi fails (harmlessly, || true) for images still referenced by
        # stopped containers or child layers.
        docker rmi "${img_id}" 2>/dev/null && REMOVED_IMAGES=$((REMOVED_IMAGES + 1)) || true
    fi
done <<< "${ALL_IMAGES}"

log "INFO" "Removed ${REMOVED_IMAGES} images older than ${IMAGE_RETAIN_DAYS} days"

# Also remove dangling (untagged) images
DANGLING_COUNT=$(docker images -f "dangling=true" -q | wc -l)
if [[ "${DANGLING_COUNT}" -gt 0 ]]; then
    log "INFO" "Removing ${DANGLING_COUNT} dangling images"
    docker image prune -f 2>&1 | tee -a "${LOG_FILE}"
fi

# ===================================================================
# Phase 5: Volume cleanup (safe -- only anonymous volumes)
# ===================================================================
log "INFO" "--- Volume cleanup ---"

# List named volumes that are NOT in use (warn but do not remove)
# Anonymous volumes have 64-char hex names; everything else is "named".
UNUSED_NAMED=$(docker volume ls --filter "dangling=true" --format '{{.Name}}' | grep -v '^[0-9a-f]\{64\}$' || true)
if [[ -n "${UNUSED_NAMED}" ]]; then
    log "WARN" "Unused NAMED volumes detected (not removing -- review manually):"
    while IFS= read -r vol; do
        log "WARN" "  ${vol}"
    done <<< "${UNUSED_NAMED}"
fi

# Only prune anonymous volumes (64-char hex names)
ANON_VOLUMES=$(docker volume ls --filter "dangling=true" --format '{{.Name}}' | grep '^[0-9a-f]\{64\}$' || true)
if [[ -n "${ANON_VOLUMES}" ]]; then
    ANON_COUNT=$(echo "${ANON_VOLUMES}" | wc -l)
    log "INFO" "Removing ${ANON_COUNT} anonymous dangling volumes"
    # xargs -r: do nothing if the input is empty.
    echo "${ANON_VOLUMES}" | xargs -r docker volume rm 2>&1 | tee -a "${LOG_FILE}"
else
    log "INFO" "No anonymous dangling volumes"
fi

# ===================================================================
# Phase 6: Network cleanup
# ===================================================================
log "INFO" "--- Network cleanup ---"
docker network prune -f 2>&1 | tee -a "${LOG_FILE}"

# ===================================================================
# Phase 7: Builder cache pruning with size limit
# ===================================================================
log "INFO" "--- Builder cache prune (keep ${BUILDER_CACHE_LIMIT}) ---"
docker builder prune -f --keep-storage "${BUILDER_CACHE_LIMIT}" 2>&1 | tee -a "${LOG_FILE}"

# ===================================================================
# Phase 8: Container log rotation check
# ===================================================================
log "INFO" "--- Log rotation verification ---"
if [[ -f "${LOGROTATE_CONF}" ]]; then
    log "INFO" "Logrotate config exists at ${LOGROTATE_CONF}"
    # Dry-run to verify config is valid
    if logrotate -d "${LOGROTATE_CONF}" &>/dev/null; then
        log "INFO" "Logrotate config is valid"
    else
        log "WARN" "Logrotate config has issues -- check ${LOGROTATE_CONF}"
    fi
else
    log "WARN" "No logrotate config found at ${LOGROTATE_CONF}"
    log "WARN" "Container logs may grow unbounded. Create a logrotate config or"
    log "WARN" "set log-opts in /etc/docker/daemon.json: max-size=10m, max-file=3"
fi

# Check for oversized container logs (>100MB)
LARGE_LOGS=$(find /var/lib/docker/containers/ -name "*-json.log" -size +100M 2>/dev/null || true)
if [[ -n "${LARGE_LOGS}" ]]; then
    log "WARN" "Container logs exceeding 100MB:"
    while IFS= read -r logfile; do
        SIZE=$(du -sh "${logfile}" | cut -f1)
        # The directory holding the json log is named after the container ID.
        CONTAINER_ID=$(basename "$(dirname "${logfile}")")
        CONTAINER_NAME=$(docker inspect --format '{{.Name}}' "${CONTAINER_ID}" 2>/dev/null | sed 's|^/||' || echo "unknown")
        log "WARN" "  ${CONTAINER_NAME}: ${SIZE} (${logfile})"
    done <<< "${LARGE_LOGS}"
fi

# ===================================================================
# Phase 9: Post-cleanup disk usage report
# ===================================================================
log "INFO" "--- Disk usage AFTER cleanup ---"
docker system df -v 2>&1 | tee -a "${LOG_FILE}"
DISK_AFTER=$(docker system df --format '{{.Size}}' | head -1)

# ===================================================================
# Phase 10: Summary and notification
# ===================================================================
SUMMARY="Docker maintenance complete on $(hostname -s).
Before: ${DISK_BEFORE}. After: ${DISK_AFTER}.
Images removed: ${REMOVED_IMAGES}. Errors: ${ERRORS}."

if [[ -n "${UNHEALTHY}" ]]; then
    SUMMARY+=$'\nUnhealthy containers detected -- check logs.'
fi

log "INFO" "${SUMMARY}"

if [[ "${ERRORS}" -gt 0 ]]; then
    notify "Docker Maintenance [$(hostname -s)] - ${ERRORS} errors" \
        "${SUMMARY}" "high"
    # Non-zero exit so cron/systemd flags the run as failed. Note the
    # final "Finished" log line below is skipped on this path.
    exit 1
else
    notify "Docker Maintenance [$(hostname -s)] - OK" "${SUMMARY}"
fi

log "INFO" "====== Docker Maintenance Finished ======"
systemd openrc-service-template

OpenRC Service Template

Template for creating OpenRC init scripts (Gentoo/Alpine)

bash
#!/sbin/openrc-run
# OpenRC Service Template
# Place in /etc/init.d/ and chmod +x
# Enable with: rc-update add servicename default

name="myservice"
description="My Custom Service"

# Service configuration
command="/usr/local/bin/myservice"
command_args="--config /etc/myservice/config.yaml"
command_user="commander"
command_group="commander"
# NOTE(review): command_background is only honored by OpenRC's default
# start handler; the custom start() below passes --background itself,
# so this variable is effectively unused while start() is defined.
command_background=true

# PID file location
pidfile="/run/${RC_SVCNAME}.pid"

# Log configuration
output_log="/var/log/${RC_SVCNAME}.log"
error_log="/var/log/${RC_SVCNAME}.err"

# Dependencies
depend() {
    need net        # hard dependency: do not start without networking
    after firewall  # ordering only: start after firewall when both run
    use dns logger  # soft dependencies: use if present, not required
}

# Pre-start checks
# Ensure the data directory and config file exist with correct ownership
# and permissions before the daemon starts.
start_pre() {
    # Quote expansions so the user:group pair survives word splitting.
    checkpath --directory --owner "${command_user}:${command_group}" --mode 0755 /var/lib/myservice
    checkpath --file --owner "${command_user}:${command_group}" --mode 0640 /etc/myservice/config.yaml
}

# Custom start function (optional)
# Custom start: daemonize via start-stop-daemon, writing a pidfile and
# redirecting stdout/stderr to the configured log files.
start() {
    ebegin "Starting ${name}"
    # All expansions are quoted against word splitting EXCEPT
    # ${command_args}, which must stay unquoted so the argument string
    # splits into separate words.
    start-stop-daemon --start \
        --exec "${command}" \
        --user "${command_user}" \
        --group "${command_group}" \
        --background \
        --make-pidfile \
        --pidfile "${pidfile}" \
        --stdout "${output_log}" \
        --stderr "${error_log}" \
        -- ${command_args}
    eend $?
}

# Custom stop function (optional)
# Custom stop: signal the daemon identified by the pidfile.
stop() {
    ebegin "Stopping ${name}"
    # Quote expansions so paths containing spaces cannot word-split.
    start-stop-daemon --stop \
        --exec "${command}" \
        --pidfile "${pidfile}"
    eend $?
}

# Status check
# Status check: running only if the pidfile exists AND the recorded PID
# answers signal 0. Returns 0 when running, 3 (LSB "not running") otherwise.
status() {
    if [ -f "${pidfile}" ]; then
        # Read the pidfile once and quote the expansion -- the previous
        # unquoted $(cat ...) was subject to word splitting/globbing.
        pid="$(cat "${pidfile}")"
        if kill -0 "${pid}" 2>/dev/null; then
            einfo "${name} is running (PID: ${pid})"
            return 0
        fi
    fi
    einfo "${name} is not running"
    return 3
}
systemd backup.timer

Systemd Timer Template

Systemd service and timer for scheduled tasks

ini
# Systemd Timer: backup.timer
# Place in /etc/systemd/system/
# Enable with: systemctl enable --now backup.timer

# === backup.service ===
# [Unit]
# Description=Automated Backup Service
# After=network-online.target
# Wants=network-online.target
#
# [Service]
# Type=oneshot
# ExecStart=/usr/local/bin/backup.sh
# User=root
# StandardOutput=journal
# StandardError=journal
#
# [Install]
# WantedBy=multi-user.target

# === backup.timer ===
[Unit]
Description=Run backup daily at 3 AM

[Timer]
# Run at 3:00 AM every day
OnCalendar=*-*-* 03:00:00

# Add randomized delay up to 15 minutes
RandomizedDelaySec=900

# Run immediately if we missed the last scheduled time
# (also covers catch-up runs shortly after boot)
Persistent=true

# FIX: OnBootSec=5min was removed. It does NOT suppress boot-time runs as
# the old comment claimed -- it ADDS an extra trigger 5 minutes after
# every boot. Persistent=true already handles missed schedules.

[Install]
WantedBy=timers.target

# === Useful timer expressions ===
# OnCalendar=hourly           # Every hour
# OnCalendar=daily            # Every day at midnight
# OnCalendar=weekly           # Every Monday at midnight
# OnCalendar=*-*-* 04:00:00   # Every day at 4 AM
# OnCalendar=Mon *-*-* 02:00  # Every Monday at 2 AM
# OnCalendar=*-*-01 00:00:00  # First of every month

# === Commands ===
# systemctl list-timers              # List all timers
# systemctl status backup.timer      # Check timer status
# systemctl start backup.service     # Run manually
# journalctl -u backup.service       # View logs
shell zfs-snapshot.sh

ZFS Snapshot Management

Automated ZFS snapshots with retention policy

bash
#!/bin/bash
# ZFS Snapshot Management Script
# Creates snapshots with automatic rotation

# Abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Configuration
POOL="tank-storage"     # ZFS pool to snapshot (recursively)
SNAPSHOT_PREFIX="auto"  # Prefix marking script-managed snapshots
HOURLY_KEEP=24          # Retention: hourly snapshots to keep
DAILY_KEEP=30           # Retention: daily snapshots to keep
WEEKLY_KEEP=12          # Retention: weekly snapshots to keep

log() {
    # Print the message prefixed with a "[YYYY-mm-dd HH:MM:SS]" timestamp.
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# Create a recursive snapshot of POOL named
# <pool>@<prefix>_<type>_<timestamp>.
# $1: snapshot type label ("hourly" | "daily" | "weekly")
create_snapshot() {
    local snap_type="$1"
    # NOTE(review): `local var=$(cmd)` masks the command's exit status
    # under set -e; harmless here since date with a format rarely fails.
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local snap_name="${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_${timestamp}"

    log "Creating snapshot: ${snap_name}"
    zfs snapshot -r "${snap_name}"
}

# Print snapshot names of the given type, oldest first (-s creation).
# $1: snapshot type label. Empty output (and success) when none exist.
list_snapshots() {
    local snap_type="$1"
    # || true: grep exits 1 on no match, which would kill the script via set -e.
    zfs list -t snapshot -o name -s creation | grep "${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_" || true
}

# Delete old snapshots
# Destroy the oldest snapshots of a type until only "keep" remain.
# $1: snapshot type label; $2: number of snapshots to retain.
cleanup_snapshots() {
    local snap_type="$1"
    local keep="$2"
    local snaps=()

    # list_snapshots emits one name per line, oldest first.
    mapfile -t snaps < <(list_snapshots "${snap_type}")
    local total=${#snaps[@]}

    if (( total <= keep )); then
        log "No ${snap_type} snapshots to clean (have ${total}, keep ${keep})"
        return 0
    fi

    local excess=$((total - keep))
    log "Cleaning up ${excess} old ${snap_type} snapshots"

    local snap
    for snap in "${snaps[@]:0:excess}"; do
        log "Deleting: ${snap}"
        zfs destroy -r "${snap}"
    done
}

# Main logic based on argument
# Dispatch on the first CLI argument; defaults to "hourly" when absent
# (so a bare cron entry performs the hourly rotation).
case "${1:-hourly}" in
    hourly)
        create_snapshot "hourly"
        cleanup_snapshots "hourly" $HOURLY_KEEP
        ;;
    daily)
        create_snapshot "daily"
        cleanup_snapshots "daily" $DAILY_KEEP
        ;;
    weekly)
        create_snapshot "weekly"
        cleanup_snapshots "weekly" $WEEKLY_KEEP
        ;;
    list)
        # Read-only: show all managed snapshots grouped by type.
        echo "=== Hourly Snapshots ==="
        list_snapshots "hourly"
        echo
        echo "=== Daily Snapshots ==="
        list_snapshots "daily"
        echo
        echo "=== Weekly Snapshots ==="
        list_snapshots "weekly"
        ;;
    status)
        # Read-only: pool health plus snapshot counts vs retention targets.
        echo "=== ZFS Pool Status ==="
        zpool status $POOL
        echo
        echo "=== Snapshot Counts ==="
        echo "Hourly: $(list_snapshots hourly | wc -l) / $HOURLY_KEEP"
        echo "Daily:  $(list_snapshots daily | wc -l) / $DAILY_KEEP"
        echo "Weekly: $(list_snapshots weekly | wc -l) / $WEEKLY_KEEP"
        ;;
    *)
        echo "Usage: $0 {hourly|daily|weekly|list|status}"
        exit 1
        ;;
esac
ansible roles/docker-host/

Ansible Role Structure

Complete Ansible role for Docker host provisioning showing proper directory layout with tasks, handlers, defaults, vars, meta, and templates. Initialize with: ansible-galaxy init roles/docker-host

yaml
# ===========================================================
# Ansible Role: docker-host
# ===========================================================
# Directory structure:
#   roles/docker-host/
#   ├── tasks/main.yml        # Core task logic
#   ├── handlers/main.yml     # Service restart handlers
#   ├── templates/            # Jinja2 templates
#   │   └── daemon.json.j2    # Docker daemon config
#   ├── defaults/main.yml     # Default variables (overridable)
#   ├── vars/main.yml         # Role-internal variables
#   └── meta/main.yml         # Galaxy metadata + dependencies
#
# Create scaffold: ansible-galaxy init roles/docker-host
# Usage in playbook:
#   - hosts: docker_hosts
#     roles:
#       - role: docker-host
#         docker_log_max_size: "50m"
# ===========================================================

# ===================== defaults/main.yml ====================
# These variables can be overridden in playbooks, group_vars,
# or host_vars. They define sane defaults for all Docker hosts.
# ============================================================
# docker_edition: "ce"
# docker_version: "5:26.1.4-1~ubuntu.22.04~jammy"
# docker_compose_version: "2.27.0"
#
# # Daemon configuration defaults
# docker_log_driver: "json-file"
# docker_log_max_size: "10m"
# docker_log_max_file: "3"
# docker_storage_driver: "overlay2"
# docker_live_restore: true
# docker_default_address_pool_base: "172.17.0.0/12"
# docker_default_address_pool_size: 24
#
# # User to add to the docker group
# docker_users:
#   - commander
#
# # Data root (set to a dedicated disk/partition if available)
# docker_data_root: "/var/lib/docker"

# ===================== vars/main.yml ========================
# Internal variables -- not intended to be overridden.
# ============================================================
# docker_apt_key_url: "https://download.docker.com/linux/ubuntu/gpg"
# docker_apt_repo: "deb https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
# docker_packages:
#   - "docker-{{ docker_edition }}"
#   - "docker-{{ docker_edition }}-cli"
#   - "containerd.io"
#   - "docker-buildx-plugin"
#   - "docker-compose-plugin"

# ===================== meta/main.yml ========================
# Galaxy metadata and role dependencies.
# ============================================================
# galaxy_info:
#   author: commander
#   description: "Provision and configure Docker CE hosts"
#   company: ArgoBox Homelab
#   license: MIT
#   min_ansible_version: "2.15"
#   platforms:
#     - name: Ubuntu
#       versions:
#         - jammy
#         - noble
#     - name: Debian
#       versions:
#         - bookworm
#   galaxy_tags:
#     - docker
#     - containers
#     - homelab
#
# dependencies:
#   # Run common role first (sets up users, SSH keys, base packages)
#   - role: common

# ===================== handlers/main.yml ====================
# Handlers are triggered by 'notify' in tasks. They run once
# at the end of the play, even if notified multiple times.
# ============================================================

---
# handlers/main.yml
# Handlers run once at the end of the play no matter how many tasks
# notified them; `listen` lets tasks notify by topic name.
- name: Restart docker
  ansible.builtin.service:
    name: docker
    state: restarted
  listen: "restart docker"

- name: Reload docker daemon
  ansible.builtin.systemd:
    name: docker
    state: reloaded
    daemon_reload: true  # canonical boolean instead of truthy "yes"
  listen: "reload docker"

# ===================== tasks/main.yml =======================
# Main task file -- installs Docker, deploys daemon config,
# adds users to docker group, starts the service.
# ============================================================

# tasks/main.yml
# Booleans normalized to canonical true/false (yamllint "truthy" rule).
- name: Install prerequisites
  ansible.builtin.apt:
    name:
      - apt-transport-https
      - ca-certificates
      - curl
      - gnupg
      - lsb-release
      - python3-docker
    state: present
    update_cache: true
  tags: [docker, packages]

# NOTE(review): ansible.builtin.apt_key is deprecated (apt-key is removed
# on modern Debian/Ubuntu). Consider downloading the key into
# /etc/apt/keyrings and adding a signed-by clause to the repo entry.
- name: Add Docker GPG key
  ansible.builtin.apt_key:
    url: "{{ docker_apt_key_url }}"
    state: present
  tags: [docker, repo]

- name: Add Docker APT repository
  ansible.builtin.apt_repository:
    repo: "{{ docker_apt_repo }}"
    state: present
    filename: docker
  tags: [docker, repo]

- name: Install Docker packages
  ansible.builtin.apt:
    name: "{{ docker_packages }}"
    state: present
    update_cache: true
  tags: [docker, packages]

- name: Create Docker config directory
  ansible.builtin.file:
    path: /etc/docker
    state: directory
    owner: root
    group: root
    mode: "0755"
  tags: [docker, config]

- name: Deploy daemon.json from template
  ansible.builtin.template:
    src: daemon.json.j2
    dest: /etc/docker/daemon.json
    owner: root
    group: root
    mode: "0644"
  notify: restart docker
  tags: [docker, config]
  # --- Template: templates/daemon.json.j2 ---
  # {
  #   "log-driver": "{{ docker_log_driver }}",
  #   "log-opts": {
  #     "max-size": "{{ docker_log_max_size }}",
  #     "max-file": "{{ docker_log_max_file }}"
  #   },
  #   "storage-driver": "{{ docker_storage_driver }}",
  #   "live-restore": {{ docker_live_restore | lower }},
  #   "data-root": "{{ docker_data_root }}",
  #   "default-address-pools": [
  #     {
  #       "base": "{{ docker_default_address_pool_base }}",
  #       "size": {{ docker_default_address_pool_size }}
  #     }
  #   ],
  #   "features": {
  #     "buildkit": true
  #   }
  # }

- name: Add users to docker group
  ansible.builtin.user:
    name: "{{ item }}"
    groups: docker
    append: true
  loop: "{{ docker_users }}"
  tags: [docker, users]

- name: Enable and start Docker service
  ansible.builtin.service:
    name: docker
    state: started
    enabled: true
  tags: [docker, service]

- name: Verify Docker is running
  ansible.builtin.command: docker info
  changed_when: false
  register: docker_info
  tags: [docker, verify]

- name: Show Docker version
  ansible.builtin.debug:
    msg: "Docker {{ docker_info.stdout_lines[1] | trim }} installed on {{ inventory_hostname }}"
  # NOTE(review): indexing stdout_lines[1] assumes a fixed `docker info`
  # output layout -- verify, or prefer `docker version --format`.
  tags: [docker, verify]

# ===================== Playbook using this role =============
# site.yml or docker-hosts.yml:
#
# ---
# - name: Provision Docker hosts
#   hosts: docker_hosts
#   become: yes
#   roles:
#     - role: docker-host
#       docker_log_max_size: "50m"
#       docker_users:
#         - commander
#         - deploy
ansible inventory/

Ansible Inventory + Group Vars

Production Ansible inventory with host groups, group_vars, host_vars, and ansible.cfg using ArgoBox star-themed naming and 10.42.0.x addressing

yaml
# ===========================================================
# Ansible Inventory + Group Vars
# ===========================================================
# Directory layout:
#   inventory/
#   ├── hosts.yml                    # Host inventory
#   ├── group_vars/
#   │   ├── all.yml                  # Variables for every host
#   │   ├── docker_hosts.yml         # Docker-specific vars
#   │   └── k3s_servers.yml          # K3s server vars
#   └── host_vars/
#       └── izar.yml                 # Per-host overrides
#
#   ansible.cfg                      # Project-level config
# ===========================================================

# ==================== inventory/hosts.yml ===================
# All IPs use 10.42.0.x format. Hostnames are star-themed.
# ============================================================
---
all:
  children:
    # ---------------------------------------------------------
    # Proxmox hypervisors
    # ---------------------------------------------------------
    proxmox:
      hosts:
        izar-host:
          ansible_host: 10.42.0.201
          proxmox_node: izar
        arcturus-host:
          ansible_host: 10.42.0.100
          proxmox_node: arcturus

    # ---------------------------------------------------------
    # Docker container hosts
    # ---------------------------------------------------------
    docker_hosts:
      hosts:
        # altair-link (10.42.0.199) also appears in k3s_servers and
        # monitoring_servers below -- multi-group membership is
        # intentional; vars from every matching group apply to it.
        altair-link:
          ansible_host: 10.42.0.199
          docker_data_root: /opt/docker
        capella-outpost:
          ansible_host: 10.42.0.10
          docker_data_root: /var/lib/docker

    # ---------------------------------------------------------
    # K3s cluster -- server (control plane) nodes
    # ---------------------------------------------------------
    k3s_servers:
      hosts:
        altair-link:
          ansible_host: 10.42.0.199
          k3s_role: server
          # First server bootstraps the cluster datastore.
          k3s_init: true

    # ---------------------------------------------------------
    # K3s cluster -- agent (worker) nodes
    # ---------------------------------------------------------
    k3s_agents:
      hosts:
        tau-host:
          ansible_host: 10.42.0.175
          k3s_role: agent
        sirius-station:
          ansible_host: 10.42.0.50
          k3s_role: agent

    # ---------------------------------------------------------
    # Monitoring servers
    # ---------------------------------------------------------
    monitoring_servers:
      hosts:
        altair-link:
          ansible_host: 10.42.0.199

# ============== inventory/group_vars/all.yml ================
# Shared variables applied to every host in the inventory.
# ============================================================

# --- all.yml ---
# ansible_user: commander
# ansible_become: true
# ansible_python_interpreter: /usr/bin/python3
#
# # DNS (local Pi-hole + fallback)
# dns_servers:
#   - 10.42.0.1
#   - 1.1.1.1
#
# # NTP
# ntp_servers:
#   - 0.pool.ntp.org
#   - 1.pool.ntp.org
#
# # Timezone
# timezone: "America/Chicago"
#
# # Admin user for all hosts
# admin_user: commander
# admin_ssh_key: "ssh-ed25519 AAAAC3NzaC1lZDI1... commander@capella-outpost"
#
# # Default domain
# domain: "argobox.local"
#
# # Notification endpoint
# ntfy_url: "https://ntfy.sh/argobox-alerts"

# ========== inventory/group_vars/docker_hosts.yml ===========
# Variables specific to hosts running Docker.
# ============================================================

# --- docker_hosts.yml ---
# docker_log_max_size: "10m"
# docker_log_max_file: "3"
# docker_storage_driver: "overlay2"
# docker_live_restore: true
# docker_default_address_pool_base: "172.17.0.0/12"
# docker_default_address_pool_size: 24
#
# # Compose stacks to deploy
# docker_stacks_dir: /opt/stacks
# docker_stacks:
#   - portainer
#   - traefik
#   - monitoring

# ============ inventory/host_vars/izar.yml ==================
# Per-host overrides for izar-host (primary Proxmox node).
# ============================================================

# --- izar.yml ---
# proxmox_api_url: "https://10.42.0.201:8006/api2/json"
# proxmox_storage: "local-zfs"
# proxmox_backup_storage: "pbs-local"
# proxmox_vlan_aware: true
#
# # Izar has 64GB RAM -- allow larger VMs
# proxmox_default_memory: 8192
# proxmox_default_cores: 4
#
# # ZFS pool for VM storage
# zfs_pool: "rpool"

# ==================== ansible.cfg ===========================
# Project-level Ansible configuration. Place in the repo root.
# ============================================================

# [defaults]
# inventory = inventory/hosts.yml
# roles_path = roles
# vault_password_file = .vault-password
# stdout_callback = yaml
# host_key_checking = false
# retry_files_enabled = false
# gathering = smart
# fact_caching = jsonfile
# fact_caching_connection = /tmp/ansible-facts
# fact_caching_timeout = 3600
#
# [privilege_escalation]
# become = true
# become_method = sudo
# become_ask_pass = false
#
# [ssh_connection]
# pipelining = true
# ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ForwardAgent=no
ansible vault-example.yml

Ansible Vault Usage

Working examples of Ansible Vault for encrypting secrets: vault files, inline encrypted strings, vault in CI/CD, and a playbook deploying Docker stacks with vault-stored credentials

yaml
# ===========================================================
# Ansible Vault Usage Examples
# ===========================================================
# Vault encrypts sensitive data so it can live in version control.
#
# Create encrypted file:
#   ansible-vault create inventory/group_vars/vault.yml
#
# Edit encrypted file:
#   ansible-vault edit inventory/group_vars/vault.yml
#
# Encrypt a single string:
#   ansible-vault encrypt_string 'ExampleP@ss123!' --name 'db_password'
#
# Run playbook with vault:
#   ansible-playbook -i inventory site.yml --ask-vault-pass
#   ansible-playbook -i inventory site.yml --vault-password-file .vault-password
# ===========================================================

# ========= inventory/group_vars/vault.yml (encrypted) =======
# This file is encrypted at rest. Contents shown here for
# reference -- the actual file is AES-256 encrypted.
#
# Create with: ansible-vault create inventory/group_vars/vault.yml
# ============================================================

# --- vault.yml (plaintext contents before encryption) ---
# vault_db_root_password: "P@ssw0rd-CHANGE-ME"
# vault_db_gitea_password: "P@ssw0rd-CHANGE-ME"
# vault_db_nextcloud_password: "P@ssw0rd-CHANGE-ME"
# vault_grafana_admin_password: "P@ssw0rd-CHANGE-ME"
# vault_restic_repo_password: "P@ssw0rd-CHANGE-ME"
# vault_smtp_password: "P@ssw0rd-CHANGE-ME"
# vault_cloudflare_api_token: "cf-token-CHANGE-ME"
# vault_tailscale_authkey: "tskey-auth-CHANGE-ME"

# ========= Inline encrypted strings =========================
# For mixing encrypted and plaintext vars in the same file,
# use !vault | with the encrypted block from encrypt_string.
# ============================================================

# --- inventory/group_vars/docker_hosts.yml ---
# docker_registry_user: commander
# docker_registry_password: !vault |
#   $ANSIBLE_VAULT;1.1;AES256
#   61626364656667686970717273747576
#   ... (encrypted block from ansible-vault encrypt_string)
#
# smtp_relay_host: mail.argobox.local
# smtp_relay_password: !vault |
#   $ANSIBLE_VAULT;1.1;AES256
#   31323334353637383940414243444546
#   ... (encrypted block)

# ========= Playbook using vault secrets ====================
# This playbook deploys a Docker stack with database credentials
# pulled from the vault-encrypted vars file.
# ============================================================

---
- name: Deploy application stack with vault secrets
  hosts: docker_hosts
  become: true  # canonical boolean instead of truthy "yes"
  vars_files:
    # Load encrypted vault file alongside regular vars
    - inventory/group_vars/vault.yml

  vars:
    stack_name: "app-stack"
    stack_dir: "/opt/stacks/{{ stack_name }}"

  tasks:
    - name: Create stack directory
      ansible.builtin.file:
        path: "{{ stack_dir }}"
        state: directory
        owner: commander
        group: docker
        mode: "0750"

    - name: Deploy .env file with vault secrets
      ansible.builtin.template:
        src: templates/stack-env.j2
        dest: "{{ stack_dir }}/.env"
        owner: commander
        group: docker
        mode: "0600"
      # Template would contain:
      # POSTGRES_ROOT_PASSWORD={{ vault_db_root_password }}
      # POSTGRES_GITEA_PASSWORD={{ vault_db_gitea_password }}
      # GRAFANA_ADMIN_PASSWORD={{ vault_grafana_admin_password }}
      # SMTP_PASSWORD={{ vault_smtp_password }}
      no_log: true  # Prevent secrets from appearing in output

    - name: Deploy docker-compose.yml
      ansible.builtin.template:
        src: templates/docker-compose.yml.j2
        dest: "{{ stack_dir }}/docker-compose.yml"
        owner: commander
        group: docker
        mode: "0640"

    - name: Start the stack
      community.docker.docker_compose_v2:
        project_src: "{{ stack_dir }}"
        state: present
      register: stack_result

    - name: Show deployment result
      ansible.builtin.debug:
        msg: "Stack {{ stack_name }} deployed. Services: {{ stack_result.services | default({}) | list | join(', ') }}"

# ========= CI/CD vault integration ==========================
# For automated pipelines where --ask-vault-pass is not viable.
# ============================================================

# Option 1: vault password file (referenced in ansible.cfg)
# --- ansible.cfg ---
# [defaults]
# vault_password_file = .vault-password
#
# .vault-password contains a single line with the passphrase.
# Add to .gitignore so it never enters version control:
#   echo ".vault-password" >> .gitignore

# Option 2: Environment variable lookup
# --- In a playbook or vars file ---
# vault_password_from_env: "{{ lookup('env', 'ANSIBLE_VAULT_PASSWORD') }}"
#
# CI/CD pipeline sets the env var:
#   export ANSIBLE_VAULT_PASSWORD="your-vault-password-here"
#   ansible-playbook site.yml

# Option 3: Script-based vault password
# --- ansible.cfg ---
# vault_password_file = scripts/get-vault-pass.sh
#
# --- scripts/get-vault-pass.sh ---
# #!/bin/bash
# # Pull vault password from a secret manager
# # e.g., from pass, 1Password CLI, or HashiCorp Vault
# pass show ansible/vault-password

# ========= Rekeying vault files =============================
# Change the encryption password on all vault files:
#   ansible-vault rekey inventory/group_vars/vault.yml
#   ansible-vault rekey --new-vault-password-file new-pass.txt *.yml
# ============================================================
terraform terraform-project/

Terraform Backend + Variables

Production Terraform setup with S3-compatible backend (Minio), state locking, typed variables with validation, outputs, and version constraints

hcl
# ===========================================================
# Terraform: Production Project Structure
# ===========================================================
# Directory layout:
#   terraform-project/
#   ├── backend.tf       # State backend configuration
#   ├── versions.tf      # Provider version constraints
#   ├── variables.tf     # Input variable definitions
#   ├── terraform.tfvars # Variable values (do not commit secrets)
#   ├── main.tf          # Resource definitions
#   └── outputs.tf       # Output values
#
# Workflow:
#   terraform init       # Initialize backend + download providers
#   terraform plan       # Preview changes (always review before apply)
#   terraform apply      # Apply changes
#   terraform destroy    # Tear down (use with caution)
# ===========================================================

# ===================== versions.tf ==========================
# Pin provider versions to avoid breaking changes.
# ============================================================

terraform {
  # Any 1.x release from 1.7.0 up; "< 2.0.0" guards against a future
  # breaking major release.
  required_version = ">= 1.7.0, < 2.0.0"

  # "~> X.Y" allows patch and minor updates within the same major
  # version (e.g. ~> 2.9 permits 2.9.x through 2.x, but not 3.0).
  required_providers {
    # Proxmox VE VMs/containers (community provider).
    proxmox = {
      source  = "Telmate/proxmox"
      version = "~> 2.9"
    }
    # Cloudflare DNS / zone management.
    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.29"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.13"
    }
  }
}

# ===================== backend.tf ===========================
# S3-compatible backend using Minio on the local network.
# State locking prevents concurrent modifications.
# ============================================================

terraform {
  backend "s3" {
    # Minio at 10.42.0.20 (local S3-compatible storage).
    # Terraform 1.6 deprecated the top-level `endpoint`,
    # `dynamodb_endpoint`, and `force_path_style` arguments; this
    # project pins >= 1.7 (versions.tf), so use the replacement
    # `endpoints` map and `use_path_style` instead.
    endpoints = {
      s3       = "https://10.42.0.20:9000"
      dynamodb = "https://10.42.0.20:9001"
    }

    bucket = "terraform-state"
    key    = "homelab/argobox.tfstate"
    region = "us-east-1" # Required by the backend but ignored by Minio

    # State locking prevents concurrent modifications.
    # NOTE(review): Minio does NOT implement the DynamoDB API, so the
    # lock table must be served by something else (real AWS DynamoDB,
    # or a local DynamoDB-compatible service). On Terraform >= 1.10,
    # prefer S3-native locking (`use_lockfile = true`) and drop the
    # DynamoDB settings entirely.
    dynamodb_table = "terraform-locks"

    # Minio uses path-style access, not virtual-hosted-style.
    use_path_style              = true
    skip_credentials_validation = true
    skip_metadata_api_check     = true
    skip_requesting_account_id  = true # Minio has no STS/account APIs
    skip_region_validation      = true # region above is a placeholder

    # Credentials: set via environment variables
    # export AWS_ACCESS_KEY_ID="minio-access-key"
    # export AWS_SECRET_ACCESS_KEY="minio-secret-key"
  }
}

# ===================== variables.tf =========================
# Typed variables with validation and sensible defaults.
# ============================================================

# Deployment tier; validated against the three supported values.
variable "environment" {
  description = "Deployment environment"
  type        = string
  default     = "homelab"

  validation {
    condition     = contains(["homelab", "staging", "production"], var.environment)
    error_message = "Environment must be homelab, staging, or production."
  }
}

variable "proxmox_api_url" {
  description = "Proxmox VE API URL"
  type        = string
  default     = "https://10.42.0.201:8006/api2/json"

  # Reject plain-HTTP endpoints -- the API token would otherwise
  # travel in clear text.
  validation {
    condition     = can(regex("^https://", var.proxmox_api_url))
    error_message = "Proxmox API URL must use HTTPS."
  }
}

# No default on purpose: the token must come from terraform.tfvars or
# TF_VAR_proxmox_api_token, never from committed code.
variable "proxmox_api_token" {
  description = "Proxmox API token (user@realm!tokenid=secret)"
  type        = string
  sensitive   = true
}

variable "ssh_public_key" {
  description = "SSH public key for cloud-init provisioned VMs"
  type        = string
}

variable "vm_definitions" {
  description = "Map of VMs to create"
  # optional() attribute defaults need Terraform >= 1.3, satisfied by
  # the >= 1.7 pin in versions.tf.
  type = map(object({
    cores    = number
    memory   = number
    disk     = string
    ip       = string
    node     = string
    template = optional(string, "ubuntu-cloud-template")
  }))

  # Guard rails: catch typo-sized values before they reach Proxmox.
  validation {
    condition     = alltrue([for vm in var.vm_definitions : vm.cores >= 1 && vm.cores <= 32])
    error_message = "VM cores must be between 1 and 32."
  }

  validation {
    condition     = alltrue([for vm in var.vm_definitions : vm.memory >= 512])
    error_message = "VM memory must be at least 512 MB."
  }
}

variable "gateway" {
  description = "Default gateway for the 10.42.0.x network"
  type        = string
  default     = "10.42.0.1"
}

# First entry is the local resolver; 1.1.1.1 is the public fallback.
variable "dns_servers" {
  description = "DNS servers for VMs"
  type        = list(string)
  default     = ["10.42.0.1", "1.1.1.1"]
}

# ===================== terraform.tfvars =====================
# Example variable values. Copy to terraform.tfvars and fill in.
# Do NOT commit this file if it contains secrets.
# ============================================================

# --- terraform.tfvars ---
# environment = "homelab"
#
# proxmox_api_token = "terraform@pam!tf-token=xxxx-xxxx-xxxx"
#
# ssh_public_key = "ssh-ed25519 AAAAC3NzaC1lZDI1... commander@capella-outpost"
#
# vm_definitions = {
#   "gitea" = {
#     cores    = 2
#     memory   = 4096
#     disk     = "32G"
#     ip       = "10.42.0.30"
#     node     = "izar"
#   }
#   "monitoring" = {
#     cores    = 4
#     memory   = 8192
#     disk     = "64G"
#     ip       = "10.42.0.31"
#     node     = "izar"
#   }
# }

# ===================== outputs.tf ===========================
# Expose useful information after apply.
# ============================================================

# output "vm_ips" {
#   description = "Map of VM names to their IP addresses"
#   value       = { for name, vm in var.vm_definitions : name => vm.ip }
# }
#
# output "vm_ids" {
#   description = "Map of VM names to Proxmox VM IDs"
#   value       = { for name, vm in proxmox_vm_qemu.vms : name => vm.vmid }
# }
#
# output "proxmox_url" {
#   description = "Proxmox web UI URL"
#   value       = var.proxmox_api_url
# }
#
# output "state_backend" {
#   description = "Terraform state location"
#   value       = "s3://terraform-state/homelab/argobox.tfstate @ 10.42.0.20"
# }
terraform k8s-namespaces/main.tf

Terraform K8s Namespace Provisioning

Provision Kubernetes namespaces with ResourceQuotas, LimitRanges, default-deny NetworkPolicies, and registry credentials using for_each for scalable management

hcl
# ===========================================================
# Terraform: Kubernetes Namespace Provisioning
# ===========================================================
# Creates namespaces with security defaults:
#   - ResourceQuota (prevent resource exhaustion)
#   - LimitRange (enforce per-pod limits)
#   - NetworkPolicy (default-deny ingress)
#   - Registry pull secret (private container images)
#
# Uses for_each to manage multiple namespaces from a variable.
# ===========================================================

terraform {
  required_providers {
    # Official HashiCorp Kubernetes provider.
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.29"   # 2.29.x up to, not including, 3.0
    }
  }
}

# Configure the K8s provider to talk to the local cluster.
# NOTE(review): kubeconfig-file auth is fine for a homelab; in CI,
# prefer token/exec auth so the run does not depend on a local file.
provider "kubernetes" {
  config_path    = "~/.kube/config"
  config_context = "k3s-homelab"
}

# ===================== Variables ============================

variable "namespaces" {
  description = "Map of namespace names to their quota settings"
  # CPU values are Kubernetes quantity strings ("2", "500m"); memory
  # uses binary suffixes ("2Gi"). labels defaults to an empty map via
  # optional() (requires Terraform >= 1.3).
  type = map(object({
    cpu_request    = string
    cpu_limit      = string
    memory_request = string
    memory_limit   = string
    pod_limit      = number
    labels         = optional(map(string), {})
  }))
  default = {
    "gitea" = {
      cpu_request    = "2"
      cpu_limit      = "4"
      memory_request = "2Gi"
      memory_limit   = "4Gi"
      pod_limit      = 20
      labels         = { tier = "platform", app = "gitea" }
    }
    "monitoring" = {
      cpu_request    = "4"
      cpu_limit      = "8"
      memory_request = "4Gi"
      memory_limit   = "8Gi"
      pod_limit      = 50
      labels         = { tier = "observability" }
    }
    "apps" = {
      cpu_request    = "2"
      cpu_limit      = "4"
      memory_request = "2Gi"
      memory_limit   = "4Gi"
      pod_limit      = 30
      labels         = { tier = "application" }
    }
  }
}

variable "registry_server" {
  description = "Container registry URL"
  type        = string
  default     = "https://registry.10.42.0.199.nip.io"
}

variable "registry_username" {
  description = "Registry pull credentials -- username"
  type        = string
  default     = "commander"
}

# No default on purpose: must come from tfvars or
# TF_VAR_registry_password, never from committed code.
variable "registry_password" {
  description = "Registry pull credentials -- password"
  type        = string
  sensitive   = true
}

# ===================== Namespaces ===========================

resource "kubernetes_namespace" "ns" {
  # One namespace per entry in var.namespaces; each.key is the name.
  for_each = var.namespaces

  metadata {
    name = each.key

    # Per-namespace labels are merged over the common base labels, so
    # an entry may override managed-by/environment if it needs to.
    labels = merge(
      {
        "managed-by"  = "terraform"
        "environment" = "homelab"
      },
      each.value.labels
    )

    annotations = {
      "description" = "Managed by Terraform -- do not modify manually"
    }
  }
}

# ===================== ResourceQuota ========================
# Prevents any single namespace from consuming all cluster resources.

resource "kubernetes_resource_quota" "quota" {
  for_each = var.namespaces

  metadata {
    name      = "${each.key}-quota"
    # Referencing the namespace resource (not each.key directly) makes
    # Terraform create the namespace before the quota.
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }

  spec {
    # Aggregate caps across ALL pods in the namespace.
    hard = {
      "requests.cpu"    = each.value.cpu_request
      "limits.cpu"      = each.value.cpu_limit
      "requests.memory" = each.value.memory_request
      "limits.memory"   = each.value.memory_limit
      "pods"            = each.value.pod_limit
    }
  }
}

# ===================== LimitRange ===========================
# Sets default requests/limits for pods that do not specify them.
# Needed in practice once a cpu/memory ResourceQuota exists: pods
# without explicit requests/limits would otherwise be rejected.

resource "kubernetes_limit_range" "limits" {
  for_each = var.namespaces

  metadata {
    name      = "${each.key}-limits"
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }

  spec {
    limit {
      type = "Container"
      # Applied when a container declares no limits at all.
      default = {
        cpu    = "500m"
        memory = "512Mi"
      }
      # Applied when a container declares no requests.
      default_request = {
        cpu    = "100m"
        memory = "128Mi"
      }
      # Per-container ceiling, mirroring the namespace quota.
      max = {
        cpu    = each.value.cpu_limit
        memory = each.value.memory_limit
      }
    }
  }
}

# ===================== NetworkPolicy ========================
# Default-deny ingress for every namespace. Services that need
# ingress must define their own NetworkPolicy to allow it.

resource "kubernetes_network_policy" "default_deny" {
  for_each = var.namespaces

  metadata {
    name      = "default-deny-ingress"
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }

  spec {
    # Empty selector = the policy applies to every pod in the namespace.
    pod_selector {}

    policy_types = ["Ingress"]

    # No ingress rules = deny all ingress by default.
    # Individual apps should create allow policies as needed.
    # NOTE(review): egress is still unrestricted; add "Egress" to
    # policy_types (plus allow rules) if outbound lockdown is wanted.
  }
}

# ===================== Registry Secret ======================
# Deploy image pull secrets so pods can pull from private registries.
# Pods still need `imagePullSecrets: [{name: registry-credentials}]`
# (or a patched default ServiceAccount) to actually use it.

resource "kubernetes_secret" "registry_creds" {
  for_each = var.namespaces

  metadata {
    name      = "registry-credentials"
    namespace = kubernetes_namespace.ns[each.key].metadata[0].name
  }

  type = "kubernetes.io/dockerconfigjson"

  data = {
    # The provider base64-encodes `data` values itself, so the JSON is
    # supplied as plain text here. `auth` is the base64 "user:pass"
    # pair Docker-style clients expect inside the config JSON.
    ".dockerconfigjson" = jsonencode({
      auths = {
        (var.registry_server) = {
          username = var.registry_username
          password = var.registry_password
          auth     = base64encode("${var.registry_username}:${var.registry_password}")
        }
      }
    })
  }
}

# ===================== Outputs ==============================

output "namespace_names" {
  description = "Created namespace names"
  # Full splat over the resource map's values -- equivalent to the
  # for-expression form, returned in lexical key order.
  value       = values(kubernetes_namespace.ns)[*].metadata[0].name
}
packer packer/ubuntu-template.pkr.hcl

Packer VM Template

Build immutable VM images for Proxmox using Packer with Ubuntu cloud image source, shell and Ansible provisioners, and automatic template conversion

hcl
# ===========================================================
# Packer: Ubuntu VM Template for Proxmox
# ===========================================================
# Builds an immutable, pre-configured Ubuntu VM template
# that can be cloned by Terraform or the Proxmox UI.
#
# Build command:
#   packer init ubuntu-template.pkr.hcl
#   packer validate -var-file=variables.pkrvars.hcl ubuntu-template.pkr.hcl
#   packer build -var-file=variables.pkrvars.hcl ubuntu-template.pkr.hcl
# ===========================================================

packer {
  required_plugins {
    # NOTE(review): ">= x" constraints are open-ended -- a future
    # major plugin release can break this template without warning.
    # Consider "~>" pins to stay within a known-good major version.
    proxmox = {
      version = ">= 1.1.8"
      source  = "github.com/hashicorp/proxmox"
    }
    ansible = {
      version = ">= 1.1.1"
      source  = "github.com/hashicorp/ansible"
    }
  }
}

# ===================== Variables ============================

variable "proxmox_api_url" {
  type        = string
  description = "Proxmox API endpoint"
  default     = "https://10.42.0.201:8006/api2/json"
}

# Token credentials are split: the ID names the token, the secret
# authenticates it. Neither has a default -- supply via pkrvars or
# PKR_VAR_* environment variables.
variable "proxmox_api_token_id" {
  type        = string
  description = "Proxmox API token ID (user@realm!tokenname)"
  sensitive   = true
}

variable "proxmox_api_token_secret" {
  type        = string
  description = "Proxmox API token secret"
  sensitive   = true
}

# Proxmox cluster node the build VM runs on.
variable "proxmox_node" {
  type    = string
  default = "izar"
}

variable "template_name" {
  type    = string
  default = "ubuntu-cloud-template"
}

variable "ubuntu_iso" {
  type        = string
  description = "Ubuntu cloud image ISO"
  default     = "local:iso/ubuntu-24.04-live-server-amd64.iso"
}

variable "ssh_username" {
  type    = string
  default = "commander"
}

# Build-time-only password; must match the account the installer
# creates. NOTE(review): override the placeholder default in
# variables.pkrvars.hcl -- a real secret must never live in a default.
variable "ssh_password" {
  type      = string
  sensitive = true
  default   = "your-password-here"
}

variable "vm_cores" {
  type    = number
  default = 2
}

# Memory in MiB.
variable "vm_memory" {
  type    = number
  default = 4096
}

# Proxmox disk-size string, e.g. "32G".
variable "vm_disk_size" {
  type    = string
  default = "32G"
}

# ===================== Source ===============================
# Proxmox ISO builder -- boots from the Ubuntu ISO, runs
# autoinstall, then hands off to provisioners.
# ============================================================

source "proxmox-iso" "ubuntu" {
  # Connection
  # NOTE(review): TLS verification is disabled -- acceptable for a
  # homelab self-signed cert; use a trusted CA in production.
  proxmox_url              = var.proxmox_api_url
  username                 = var.proxmox_api_token_id
  token                    = var.proxmox_api_token_secret
  insecure_skip_tls_verify = true
  node                     = var.proxmox_node

  # VM settings -- fixed VMID so repeated builds replace the same VM
  vm_id   = 9000
  vm_name = var.template_name

  # Hardware
  cores   = var.vm_cores
  sockets = 1
  memory  = var.vm_memory
  os      = "l26"  # Linux 2.6+ kernel

  # Boot disk
  # NOTE(review): this flat disks{} form is the proxmox plugin 1.x
  # syntax; the open-ended ">= 1.1.8" pin admits 2.x, which changed
  # this block -- confirm the plugin version before upgrading.
  disks {
    storage_pool = "local-zfs"
    disk_size    = var.vm_disk_size
    type         = "scsi"
    format       = "raw"
  }

  # Network
  network_adapters {
    model    = "virtio"
    bridge   = "vmbr0"
    firewall = false
  }

  # ISO and boot
  # NOTE(review): no autoinstall data source is wired up here (no
  # http_directory / ds=nocloud seed on the kernel cmdline), so the
  # live-server installer may stop at interactive prompts -- verify
  # against a known-working build before relying on this.
  iso_file = var.ubuntu_iso
  boot_command = [
    "<esc><wait>",
    "linux /casper/vmlinuz --- autoinstall",
    "<enter><wait>",
    "initrd /casper/initrd",
    "<enter><wait>",
    "boot<enter>"
  ]
  boot_wait = "10s"

  # Cloud-init drive (lets Proxmox/Terraform inject per-clone config)
  cloud_init              = true
  cloud_init_storage_pool = "local-zfs"

  # SSH connection for provisioning; generous timeout because the OS
  # install must finish before SSH comes up
  ssh_username = var.ssh_username
  ssh_password = var.ssh_password
  ssh_timeout  = "20m"

  # QEMU guest agent (required for Terraform integration)
  qemu_agent = true

  # Convert to template when done
  template_name        = var.template_name
  template_description = "Ubuntu 24.04 template built by Packer on ${timestamp()}"

  # Tags for organization in the Proxmox UI
  tags = "template;ubuntu;packer"
}

# ===================== Build ================================

build {
  sources = ["source.proxmox-iso.ubuntu"]

  # ---------------------------------------------------------
  # Provisioner 1: Shell -- base system configuration
  # ---------------------------------------------------------
  # The inline entries are joined with newlines into ONE script, so
  # the "\\" line continuations and "#" comment lines behave exactly
  # as they would in a normal shell file.
  provisioner "shell" {
    inline = [
      "# Wait for cloud-init to finish",
      "cloud-init status --wait",
      "",
      "# Update package index and upgrade",
      "sudo apt-get update -y",
      "sudo apt-get upgrade -y",
      "",
      "# Install essential packages",
      "sudo apt-get install -y \\",
      "  qemu-guest-agent \\",
      "  curl wget git vim \\",
      "  ca-certificates gnupg \\",
      "  python3 python3-pip \\",
      "  unattended-upgrades \\",
      "  fail2ban \\",
      "  htop tmux jq",
      "",
      "# Enable QEMU guest agent",
      "sudo systemctl enable --now qemu-guest-agent",
      "",
      "# Clean up apt cache to reduce image size",
      "sudo apt-get autoremove -y",
      "sudo apt-get clean",
      "",
      "# Zero free space for better compression (fills the disk until",
      "# dd fails with ENOSPC, hence the '|| true')",
      "sudo dd if=/dev/zero of=/EMPTY bs=1M 2>/dev/null || true",
      "sudo rm -f /EMPTY",
      "",
      "# Clear machine-id so each clone gets a unique ID",
      "sudo truncate -s 0 /etc/machine-id",
      "sudo rm -f /var/lib/dbus/machine-id"
    ]
  }

  # ---------------------------------------------------------
  # Provisioner 2: File -- copy hardened SSH config
  # ---------------------------------------------------------
  # Staged via /tmp because the SSH user cannot write to /etc
  # directly; the follow-up shell step moves it into place as root.
  provisioner "file" {
    source      = "files/sshd_config"
    destination = "/tmp/sshd_config"
  }

  provisioner "shell" {
    inline = [
      "sudo mv /tmp/sshd_config /etc/ssh/sshd_config",
      "sudo chown root:root /etc/ssh/sshd_config",
      "sudo chmod 600 /etc/ssh/sshd_config"
    ]
  }

  # ---------------------------------------------------------
  # Provisioner 3: Ansible -- run hardening role
  # ---------------------------------------------------------
  # Packer generates a temporary inventory pointing at the build VM;
  # the playbook addresses it via the 'default' host alias.
  provisioner "ansible" {
    playbook_file = "ansible/harden.yml"
    user          = var.ssh_username
    extra_arguments = [
      "--extra-vars", "target_host=default"
    ]
  }
}

# ===================== variables.pkrvars.hcl ================
# Example variable values file. Copy and fill in secrets.
# ============================================================

# --- variables.pkrvars.hcl ---
# proxmox_api_url          = "https://10.42.0.201:8006/api2/json"
# proxmox_api_token_id     = "packer@pam!packer-token"
# proxmox_api_token_secret = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
# proxmox_node             = "izar"
# ssh_password             = "temporary-build-password"
shell openrc-myapp

Gentoo/OpenRC Service Script

Complete OpenRC init script with depend(), start(), stop(), status(), checkconfig(), start-stop-daemon usage, conf.d configuration, and a comparison with the equivalent systemd unit file

bash
#!/sbin/openrc-run
# ===================================================================
# OpenRC Init Script: myapp
# ===================================================================
# A complete OpenRC service script for a generic application daemon.
# Designed for Gentoo Linux but works on any OpenRC distribution
# (Alpine, Artix, etc.).
#
# Installation:
#   1. Copy this file to /etc/init.d/myapp
#   2. chmod +x /etc/init.d/myapp
#   3. Copy conf.d file to /etc/conf.d/myapp
#   4. rc-update add myapp default
#   5. rc-service myapp start
#
# Management:
#   rc-service myapp start|stop|restart|status
#   rc-service myapp checkconfig    # Validate before starting
# ===================================================================

# ---------------------------------------------------------------
# Service metadata -- used by rc-status and service listings
# ---------------------------------------------------------------
name="myapp"
description="MyApp Application Server"
# Extra subcommands: checkconfig is callable in any state; reload only
# while the service is running.
extra_commands="checkconfig"
extra_started_commands="reload"
description_checkconfig="Verify configuration before start"
description_reload="Reload configuration without restart"

# ---------------------------------------------------------------
# Read configuration from /etc/conf.d/myapp
# Variables set there override the defaults below.
# ---------------------------------------------------------------

# Defaults (overridden by /etc/conf.d/myapp)
# `: ${VAR:=default}` assigns only when VAR is unset or empty, so
# values sourced from /etc/conf.d/myapp by OpenRC take precedence.
: ${MYAPP_USER:="myapp"}
: ${MYAPP_GROUP:="myapp"}
: ${MYAPP_CONFIG:="/etc/myapp/config.yaml"}
: ${MYAPP_DATADIR:="/var/lib/myapp"}
: ${MYAPP_LOGDIR:="/var/log/myapp"}
: ${MYAPP_BIND:="10.42.0.199"}
: ${MYAPP_PORT:="8080"}
: ${MYAPP_EXTRA_OPTS:=""}

# NOTE(review): because this script defines its own start()/stop(),
# OpenRC's built-in runner does not act on command_background /
# command_user; these variables are reused by the functions below.
command="/usr/local/bin/myapp"
command_args="--config ${MYAPP_CONFIG} --bind ${MYAPP_BIND}:${MYAPP_PORT} ${MYAPP_EXTRA_OPTS}"
command_user="${MYAPP_USER}:${MYAPP_GROUP}"
command_background=true
pidfile="/run/${RC_SVCNAME}.pid"

# stdout/stderr destinations for the backgrounded daemon.
output_log="${MYAPP_LOGDIR}/${RC_SVCNAME}.log"
error_log="${MYAPP_LOGDIR}/${RC_SVCNAME}.err"

# ---------------------------------------------------------------
# depend() -- declare service dependencies and ordering
# ---------------------------------------------------------------
depend() {
    # 'need' = hard dependency, service will not start without these
    # ('net' is the virtual service provided by configured interfaces)
    need net
    need localmount

    # 'after' = start after these if they exist (soft ordering)
    after firewall
    after dns
    after postgresql

    # 'use' = use if available, but not required
    use logger
    use dns

    # 'provide' = this service provides the "webapp" virtual service
    # (other scripts can then 'need webapp' without naming myapp)
    provide webapp
}

# ---------------------------------------------------------------
# checkconfig() -- validate configuration before starting
# Called automatically by start_pre(), also callable manually:
#   rc-service myapp checkconfig
# ---------------------------------------------------------------
checkconfig() {
    # Guard 1: the config file must exist and be a regular file.
    [ -f "${MYAPP_CONFIG}" ] || {
        eerror "Config file not found: ${MYAPP_CONFIG}"
        return 1
    }

    # Guard 2: the daemon binary must be present and executable.
    [ -x "${command}" ] || {
        eerror "Binary not found or not executable: ${command}"
        return 1
    }

    # Delegate to the application's own validator (output discarded;
    # only the exit status matters).
    if ! "${command}" --validate-config "${MYAPP_CONFIG}" >/dev/null 2>&1; then
        eerror "Configuration validation failed"
        return 1
    fi

    einfo "Configuration is valid"
    return 0
}

# ---------------------------------------------------------------
# start_pre() -- runs before start(). Create directories, check config.
# ---------------------------------------------------------------
start_pre() {
    # Validate configuration first; abort the start on failure.
    checkconfig || return 1

    # checkpath creates missing directories and fixes owner/mode.
    # Data dir is group-accessible only; log dir is world-readable.
    checkpath --directory --owner "${MYAPP_USER}:${MYAPP_GROUP}" \
        --mode 0750 "${MYAPP_DATADIR}"
    checkpath --directory --owner "${MYAPP_USER}:${MYAPP_GROUP}" \
        --mode 0755 "${MYAPP_LOGDIR}"
    # /run exists on any booted system; this just sanity-checks the
    # pidfile's parent directory ownership/permissions.
    checkpath --directory --owner root:root \
        --mode 0755 /run
}

# ---------------------------------------------------------------
# start() -- launch the daemon using start-stop-daemon
# ---------------------------------------------------------------
start() {
    ebegin "Starting ${name}"
    # --wait is in milliseconds: after forking, wait 1s and confirm
    # the process survived before reporting success.
    # --stdout/--stderr redirection only applies with --background.
    # ${command_args} is deliberately unquoted so it word-splits into
    # separate arguments for the daemon.
    start-stop-daemon --start \
        --exec "${command}" \
        --user "${MYAPP_USER}" \
        --group "${MYAPP_GROUP}" \
        --background \
        --make-pidfile \
        --pidfile "${pidfile}" \
        --stdout "${output_log}" \
        --stderr "${error_log}" \
        --wait 1000 \
        -- ${command_args}
    eend $?
}

# ---------------------------------------------------------------
# stop() -- gracefully stop the daemon
# ---------------------------------------------------------------
stop() {
    ebegin "Stopping ${name}"
    # Retry schedule: send SIGTERM, wait up to 30s, then SIGKILL and
    # wait up to 5s more. --exec guards against killing an unrelated
    # process that reused a stale pidfile's PID.
    start-stop-daemon --stop \
        --pidfile "${pidfile}" \
        --retry "SIGTERM/30/SIGKILL/5" \
        --exec "${command}"
    eend $?
}

# ---------------------------------------------------------------
# reload() -- reload configuration without full restart
# Sends SIGHUP to the process, which most daemons handle
# as a config reload signal.
# ---------------------------------------------------------------
reload() {
    # Refuse to reload into a broken config -- the daemon would either
    # die or silently keep stale settings.
    checkconfig || return 1
    ebegin "Reloading ${name} configuration"
    # SIGHUP is the conventional "re-read configuration" signal.
    start-stop-daemon --signal HUP --pidfile "${pidfile}"
    eend $?
}

# ---------------------------------------------------------------
# status() -- check if the service is running
# ---------------------------------------------------------------
status() {
    # Read the recorded PID once and probe it with a null signal.
    # Exit codes follow LSB conventions: 0 running, 1 dead with
    # pidfile, 3 not running.
    local pid
    if [ -f "${pidfile}" ]; then
        pid=$(cat "${pidfile}")
        if kill -0 "${pid}" 2>/dev/null; then
            einfo "${name} is running (PID: ${pid})"
            return 0
        fi
        ewarn "${name} has a stale pidfile (PID: ${pid})"
        return 1
    fi
    einfo "${name} is not running"
    return 3
}

# ===================================================================
# /etc/conf.d/myapp -- Configuration file
# ===================================================================
# Place this content in /etc/conf.d/myapp to override defaults:
#
# # User and group to run as
# MYAPP_USER="myapp"
# MYAPP_GROUP="myapp"
#
# # Configuration file path
# MYAPP_CONFIG="/etc/myapp/config.yaml"
#
# # Data and log directories
# MYAPP_DATADIR="/var/lib/myapp"
# MYAPP_LOGDIR="/var/log/myapp"
#
# # Listen address and port (use 10.42.0.x format)
# MYAPP_BIND="10.42.0.199"
# MYAPP_PORT="8080"
#
# # Additional command-line options
# MYAPP_EXTRA_OPTS="--verbose --max-connections 100"

# ===================================================================
# Equivalent systemd unit file (for reference/comparison)
# ===================================================================
# [Unit]
# Description=MyApp Application Server
# After=network-online.target postgresql.service
# Wants=network-online.target
# Requires=postgresql.service
#
# [Service]
# Type=simple
# User=myapp
# Group=myapp
# ExecStartPre=/usr/local/bin/myapp --validate-config /etc/myapp/config.yaml
# ExecStart=/usr/local/bin/myapp --config /etc/myapp/config.yaml --bind 10.42.0.199:8080
# ExecReload=/bin/kill -HUP $MAINPID
# Restart=on-failure
# RestartSec=5
# StandardOutput=append:/var/log/myapp/myapp.log
# StandardError=append:/var/log/myapp/myapp.err
# LimitNOFILE=65536
#
# [Install]
# WantedBy=multi-user.target