Configuration Files
Configuration Templates
Production-ready configuration files with security best practices and detailed comments. Copy, customize, and deploy.
SSH Hardening
Secure SSH configuration with key-based auth, fail2ban-friendly settings, and modern ciphers.
# /etc/ssh/sshd_config - Hardened SSH Configuration
# Validate with "sshd -t" before reloading, and keep an existing
# session open while testing — a bad config here locks you out.
# Network
Port 22
# IPv4 only; change to "any" if this host must accept SSH over IPv6.
AddressFamily inet
ListenAddress 0.0.0.0
# Authentication
# No direct root logins — use an unprivileged account plus sudo.
PermitRootLogin no
# Key-based auth only: public keys on, password and empty-password
# logins off.
PubkeyAuthentication yes
PasswordAuthentication no
PermitEmptyPasswords no
# Disable keyboard-interactive (challenge-response) auth.
# NOTE(review): OpenSSH 8.7+ renamed this directive to
# KbdInteractiveAuthentication; the old name still works as an alias.
ChallengeResponseAuthentication no
# PAM still runs account/session modules (limits, env, lastlog)
# even though password auth is disabled above.
UsePAM yes
# Limit users/groups
# Only members of these groups may authenticate at all.
AllowGroups ssh-users admin
# Security
# Disable every forwarding/tunnelling feature this host doesn't need.
X11Forwarding no
AllowTcpForwarding no
AllowAgentForwarding no
PermitTunnel no
# At most 3 auth attempts per connection, 5 multiplexed sessions.
MaxAuthTries 3
MaxSessions 5
# Probe idle clients every 300s; drop after 2 missed replies
# (~10 minutes of silence).
ClientAliveInterval 300
ClientAliveCountMax 2
# Logging
SyslogFacility AUTH
# VERBOSE records the key fingerprint used to log in — useful for
# auditing which key authenticated.
LogLevel VERBOSE
# Modern ciphers only
Ciphers [email protected],[email protected]
MACs [email protected],[email protected]
KexAlgorithms curve25519-sha256,[email protected]
Nginx Reverse Proxy
Production-ready Nginx configuration with SSL, security headers, and upstream proxy.
# /etc/nginx/nginx.conf - Production Configuration
user nginx;
worker_processes auto;
worker_rlimit_nofile 65535;
error_log /var/log/nginx/error.log warn;
pid /run/nginx.pid;
events {
worker_connections 4096;
use epoll;
multi_accept on;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
# Performance
sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
types_hash_max_size 2048;
# Security
server_tokens off;
add_header X-Frame-Options "SAMEORIGIN" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-XSS-Protection "1; mode=block" always;
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
# SSL Settings
ssl_protocols TLSv1.2 TLSv1.3;
ssl_prefer_server_ciphers on;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1d;
# Session tickets bypass the session cache and weaken forward
# secrecy unless ticket keys are rotated — disable them.
ssl_session_tickets off;
ssl_stapling on;
ssl_stapling_verify on;
# OCSP stapling needs a resolver to reach the CA's OCSP responder;
# without one, ssl_stapling silently does nothing.
resolver 1.1.1.1 8.8.8.8 valid=300s;
resolver_timeout 5s;
# NOTE: ssl_stapling_verify also requires ssl_trusted_certificate
# (the issuing CA chain) in each TLS-terminating server block.
# Gzip
gzip on;
gzip_vary on;
gzip_comp_level 6;
gzip_types text/plain text/css application/json application/javascript;
# Rate Limiting
limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
include /etc/nginx/conf.d/*.conf;
}
fstab - NAS Mounts
CIFS/SMB mount configuration with credentials, proper options, and boot resilience.
# /etc/fstab - NAS Mount Configuration
# <device> <mount> <type> <options> <dump> <pass>
# Root filesystem
UUID=abc123-def456 / ext4 defaults 0 1
# CIFS/SMB Mounts - Using credentials file for security
# Create credentials file: /etc/cifs-credentials
# Contents:
# username=your_user
# password=your_pass
# domain=WORKGROUP
# Media share - optimized for streaming
//10.42.0.10/media /mnt/nas/media cifs credentials=/etc/cifs-credentials,uid=1000,gid=1000,iocharset=utf8,vers=3.0,_netdev,nofail,x-systemd.automount 0 0
# Backups - mounted on demand
//10.42.0.10/backups /mnt/nas/backups cifs credentials=/etc/cifs-credentials,uid=1000,gid=1000,vers=3.0,_netdev,nofail,noauto,x-systemd.automount 0 0
# Downloads - high performance options
//10.42.0.10/downloads /mnt/nas/downloads cifs credentials=/etc/cifs-credentials,uid=1000,gid=1000,vers=3.0,cache=loose,_netdev,nofail 0 0
# NFS Mount example
# 10.42.0.10:/volume1/data /mnt/nfs/data nfs4 rw,hard,intr,rsize=65536,wsize=65536,_netdev,nofail 0 0
Traefik Static Config
Traefik static configuration with entrypoints, providers, and certificate resolvers.
# /etc/traefik/traefik.yml - Static Configuration
# Static config is read once at startup; routers/middlewares live
# in the watched dynamic directory configured at the bottom.
global:
checkNewVersion: false
sendAnonymousUsage: false
# JSON logs at INFO — structured output is easier to ship to
# Loki/ELK than the default plain text.
log:
level: INFO
format: json
# Dashboard enabled but NOT in insecure mode: expose it through a
# router with auth middleware, never on a raw port.
api:
dashboard: true
insecure: false
entryPoints:
web:
address: ":80"
# Redirect all plain HTTP to HTTPS at the entrypoint level.
http:
redirections:
entryPoint:
to: websecure
scheme: https
websecure:
address: ":443"
# Default TLS for everything on :443, using the ACME resolver below.
http:
tls:
certResolver: letsencrypt
# One wildcard certificate covers the apex and all subdomains.
domains:
- main: "example.com"
sans:
- "*.example.com"
certificatesResolvers:
letsencrypt:
acme:
email: [email protected]
# Must exist, be writable by Traefik, and be chmod 600 —
# otherwise ACME registration fails at startup.
storage: /etc/traefik/acme.json
# DNS-01 challenge — required for wildcard certs. The provider
# API credentials (e.g. CF_DNS_API_TOKEN) must be in Traefik's
# environment; they are not configured in this file.
dnsChallenge:
provider: cloudflare
delayBeforeCheck: 10s
# External resolvers used to verify DNS record propagation.
resolvers:
- "1.1.1.1:53"
- "8.8.8.8:53"
providers:
docker:
endpoint: "unix:///var/run/docker.sock"
# Containers must opt in with the traefik.enable=true label.
exposedByDefault: false
network: traefik-public
# Dynamic configuration loaded from files and hot-reloaded.
file:
directory: /etc/traefik/dynamic
watch: true
Prometheus Scrape Config
Full Prometheus configuration with scrape jobs, alerting pipeline, recording rules, dynamic file-based discovery, and blackbox probing for external endpoints.
# /etc/prometheus/prometheus.yml
# Prometheus v2.x time-series database and monitoring server
# This is the main configuration file — restart Prometheus or send SIGHUP to reload
global:
# How often Prometheus scrapes targets. 15s is a good balance between
# granularity and load. Going lower (5s) doubles storage and network cost.
# Going higher (60s) means you miss short-lived spikes entirely.
scrape_interval: 15s
# How often Prometheus evaluates alerting and recording rules.
# Keep this equal to or less than scrape_interval so rules see fresh data.
evaluation_interval: 15s
# Scrape timeout per target. If a target takes longer than this to respond,
# the scrape is marked as failed. Set lower than scrape_interval to avoid
# overlapping scrapes stacking up.
scrape_timeout: 10s
# Labels attached to every time series and alert leaving this Prometheus.
# Useful when federating multiple Prometheus instances or sending to
# remote_write destinations like Thanos, Mimir, or Grafana Cloud.
external_labels:
cluster: homelab
env: production
region: local
# ---------------------------------------------------------------
# Alerting — where to send firing alerts
# Prometheus evaluates rules and pushes alerts to Alertmanager,
# which handles deduplication, grouping, silencing, and routing
# to Slack/email/PagerDuty/etc.
# ---------------------------------------------------------------
alerting:
alertmanagers:
- # Use static_configs when Alertmanager runs at a known address.
# For HA Alertmanager clusters, list all instances here —
# Prometheus will send alerts to each one (they deduplicate).
static_configs:
- targets:
- '10.42.0.5:9093'
# How long to wait before marking an Alertmanager as down.
# Shorter timeout = faster failover but more false positives.
timeout: 10s
# API version. v2 is the default since Alertmanager 0.16.
api_version: v2
# ---------------------------------------------------------------
# Rule files — alerting rules and recording rules
# Prometheus loads all .yml files matching these globs.
# Recording rules pre-compute expensive queries and store them
# as new time series, which makes dashboards faster.
# Alerting rules define conditions that trigger alerts.
# ---------------------------------------------------------------
rule_files:
# Alerting rules: disk full, high CPU, service down, etc.
- /etc/prometheus/rules/alerts/*.yml
# Recording rules: pre-aggregated metrics for dashboard performance
- /etc/prometheus/rules/recording/*.yml
# ---------------------------------------------------------------
# Scrape configs — what to monitor and how
# Each job defines a set of targets and how to scrape them.
# Targets can be static, file-based, or discovered via DNS/K8s/etc.
# ---------------------------------------------------------------
scrape_configs:
# -----------------------------------------------------------
# Prometheus self-monitoring
# Prometheus exposes its own metrics at /metrics. Monitoring
# the monitor catches config reload failures, WAL corruption,
# rule evaluation latency, and scrape target health.
# -----------------------------------------------------------
- job_name: 'prometheus'
# Override the global scrape interval for this job.
# Prometheus is cheap to scrape so 10s gives better resolution
# on internal health without meaningful overhead.
scrape_interval: 10s
static_configs:
- targets: ['localhost:9090']
labels:
instance: 'prometheus-primary'
# -----------------------------------------------------------
# Node Exporter — hardware and OS metrics
# CPU, memory, disk I/O, network, filesystem, systemd service
# state, and hundreds more. The foundation of host monitoring.
# Default port: 9100
# -----------------------------------------------------------
- job_name: 'node'
# Scrape every 15s (inherits global). Node exporter is lightweight
# and the metrics are cheap to collect.
static_configs:
- targets:
- '10.42.0.5:9100' # Primary server
- '10.42.0.10:9100' # Proxmox host
- '10.42.0.11:9100' # Secondary host
- '10.42.0.50:9100' # Database server
# NOTE(review): no relabel_configs stanza is defined for this job,
# so instance labels remain "ip:port". The stanza below is METRIC
# relabeling — it filters series after the scrape; it does not
# rename targets or extract hostnames.
metric_relabel_configs:
- source_labels: [__name__]
# Drop high-cardinality metrics that bloat storage
# without providing actionable insight.
regex: 'node_scrape_collector_duration_seconds'
action: drop
# -----------------------------------------------------------
# Docker daemon metrics
# Docker exposes engine metrics when you enable the experimental
# metrics endpoint. Add to /etc/docker/daemon.json:
# { "metrics-addr": "10.42.0.5:9323", "experimental": true }
# Tracks container counts, image pulls, build cache, and
# runtime health of the Docker engine itself.
# -----------------------------------------------------------
- job_name: 'docker'
static_configs:
- targets: ['10.42.0.5:9323']
labels:
instance: 'docker-primary'
# Docker metrics endpoint can be slow during heavy builds.
# Give it extra time before marking the scrape as failed.
scrape_timeout: 15s
# -----------------------------------------------------------
# cAdvisor — per-container resource usage
# Tracks CPU, memory, network, and disk I/O for every running
# container. Essential for right-sizing containers and catching
# memory leaks. Pairs with node_exporter to give full host +
# container visibility.
# Default port: 8080
# -----------------------------------------------------------
- job_name: 'cadvisor'
static_configs:
- targets: ['10.42.0.5:8080']
labels:
instance: 'cadvisor-primary'
# cAdvisor exposes a LOT of high-cardinality metrics by default.
# Drop the ones that generate thousands of series per container
# without adding diagnostic value.
metric_relabel_configs:
- source_labels: [__name__]
regex: 'container_tasks_state|container_memory_failures_total'
action: drop
# Drop metrics for the cAdvisor container itself to avoid
# recursive monitoring noise.
- source_labels: [name]
regex: ''
action: drop
# -----------------------------------------------------------
# Traefik reverse proxy metrics
# Traefik exposes request counts, durations, open connections,
# TLS certificate expiry, and entrypoint health. Enable metrics
# in traefik.yml with:
# metrics:
# prometheus:
# entryPoint: metrics
# Default metrics port: 8080 (same as the API/dashboard)
# -----------------------------------------------------------
- job_name: 'traefik'
static_configs:
- targets: ['10.42.0.5:8080']
labels:
instance: 'traefik-primary'
# Only grab metrics, not the dashboard HTML.
metrics_path: /metrics
# -----------------------------------------------------------
# Blackbox Exporter — probe external URLs
# The blackbox exporter performs HTTP, HTTPS, DNS, TCP, and ICMP
# checks against external targets. Use it to monitor uptime of
# public-facing services, SSL certificate expiry, and DNS
# resolution from your network's perspective.
# Runs at: 10.42.0.5:9115
# -----------------------------------------------------------
- job_name: 'blackbox-http'
# Probe every 30s. External checks are heavier than local scrapes
# and can trigger rate limits if done too aggressively.
scrape_interval: 30s
metrics_path: /probe
params:
# Which module from blackbox.yml to use for this job.
# http_2xx expects a 200-299 response code.
module: [http_2xx]
static_configs:
- targets:
# List the URLs you want to probe for availability.
# These are the TARGETS being probed, not the blackbox exporter itself.
- 'https://grafana.example.com'
- 'https://git.example.com'
- 'https://example.com'
labels:
probe_type: 'http'
relabel_configs:
# The blackbox exporter expects the probe target as a URL parameter.
# This relabeling passes the target URL to the exporter's /probe endpoint.
- source_labels: [__address__]
target_label: __param_target
# Preserve the original target URL as the instance label
# so dashboards show "https://grafana.example.com" not "10.42.0.5:9115".
- source_labels: [__param_target]
target_label: instance
# Point all scrapes at the blackbox exporter service.
- target_label: __address__
replacement: '10.42.0.5:9115'
# -----------------------------------------------------------
# File-based service discovery
# Instead of hardcoding every target in this file, point Prometheus
# at a directory of JSON/YAML files. External tools (Ansible,
# Terraform, custom scripts) can add/remove targets by writing
# files, and Prometheus picks up changes automatically without restart.
#
# Example target file (/etc/prometheus/file_sd/custom_targets.yml):
# - targets: ['10.42.0.60:9100']
# labels:
# job: 'node'
# environment: 'staging'
# -----------------------------------------------------------
- job_name: 'file-sd-custom'
file_sd_configs:
- files:
# Glob pattern — Prometheus watches these files for changes.
# Refresh interval controls how often it re-reads the directory.
- /etc/prometheus/file_sd/*.yml
- /etc/prometheus/file_sd/*.json
refresh_interval: 30s
# -----------------------------------------------------------
# Kubernetes API server (if using K3s/K8s)
# Scrapes the Kubernetes API server metrics endpoint for
# request latency, etcd health, and admission controller stats.
# Only relevant if you run K3s or full Kubernetes.
# -----------------------------------------------------------
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
action: keep
regex: default;kubernetes
Fail2ban Jail Config
Full fail2ban configuration with incremental ban escalation, email notifications, per-service jails for SSH/Nginx/Traefik, bot detection, and a recidive jail for repeat offenders.
# /etc/fail2ban/jail.local
# This file overrides /etc/fail2ban/jail.conf — never edit jail.conf directly
# because package updates will overwrite it. jail.local takes precedence.
# After changes: fail2ban-client reload
# Check status: fail2ban-client status
# Check a jail: fail2ban-client status sshd
[DEFAULT]
# ---------------------------------------------------------------
# Ban timing and escalation
# ---------------------------------------------------------------
# Base ban duration for first offense. This is the starting point
# before the multiplier kicks in. 1 hour is enough to block
# automated scanners without permanently locking out someone
# who fat-fingered a password.
bantime = 1h
# Window in which maxretry failures must occur to trigger a ban.
# 10 minutes is tight enough to catch brute-force attacks but
# won't penalize someone who fails twice in the morning and
# once in the afternoon.
findtime = 10m
# Number of failures allowed before banning. This is the global
# default — individual jails can override it.
maxretry = 5
# Incremental ban escalation (fail2ban 0.11+)
# Each subsequent ban for the same IP multiplies the bantime by
# the next value in this sequence. First ban = 1h, second = 2h,
# third = 4h, fourth = 8h, etc. This punishes repeat offenders
# progressively harder without permanently banning on first offense.
bantime.increment = true
bantime.multiplier = 1 2 4 8 16 32 64
# Maximum ban duration regardless of multiplier. Without this cap,
# the 64x multiplier on a 1h base would mean 64 hours (2.6 days).
# 1 week is a reasonable ceiling — anything beyond that should
# be handled by permanent firewall rules, not fail2ban.
bantime.maxtime = 1w
# Coefficient in the ban-time growth formula. With the explicit
# multiplier sequence above, leave this at 1 so each repeat ban is
# simply base bantime × the next multiplier. (This setting does
# not control subnet matching.)
bantime.factor = 1
# ---------------------------------------------------------------
# Actions — what happens when an IP is banned
# ---------------------------------------------------------------
# The ban action determines which firewall backend to use.
# iptables-multiport bans the IP on the specific ports defined
# in each jail. Use nftables-multiport if your system uses nftables.
banaction = iptables-multiport
banaction_allports = iptables-allports
# action_mwl = ban + mail with whois lookup + relevant log lines.
# This sends an email for every ban with the offender's IP, whois
# info, and the log entries that triggered the ban. Useful for
# spotting targeted attacks vs random scanning.
# Other options:
# action_: ban only (no notification)
# action_mw: ban + mail with whois (no log lines)
# action_mwl: ban + mail + whois + log lines (most verbose)
action = %(action_mwl)s
# ---------------------------------------------------------------
# Email notification settings
# These variables are used by the action_mw and action_mwl actions.
# ---------------------------------------------------------------
# Where ban notifications are sent.
destemail = [email protected]
# The From address on notification emails. Use a real domain so
# your mail server doesn't reject it as spoofed.
sender = [email protected]
# Mail transfer agent. sendmail works on most systems. Use
# mail if sendmail isn't installed, or msmtp for relay setups.
mta = sendmail
# ---------------------------------------------------------------
# Whitelist — never ban these addresses
# ---------------------------------------------------------------
# Always whitelist your own network. Locking yourself out of SSH
# because you mistyped your password 3 times is a bad day.
# Include loopback (127.0.0.1), IPv6 loopback (::1), and your
# entire LAN subnet.
ignoreip = 127.0.0.1/8 ::1 10.42.0.0/24
# How fail2ban resolves hostnames found in logs. Valid values:
# "yes" (resolve), "warn" (resolve but log a warning), "no"
# (ignore hostnames, ban raw IPs only), "raw" (use value as-is).
usedns = warn
# ---------------------------------------------------------------
# Backend — how fail2ban reads log files
# ---------------------------------------------------------------
# auto = tries pyinotify, then systemd, then polling.
# systemd = reads from journald (use if your services log there).
# polling = fallback, checks files on a timer (works everywhere).
backend = auto
# ===============================================================
# JAILS — one section per service to protect
# ===============================================================
# ---------------------------------------------------------------
# SSH — the most targeted service on any public-facing server
# Brute-force bots hit SSH constantly. 3 retries with a 24h ban
# is aggressive but necessary. Key-based auth is your real defense;
# fail2ban is the second layer.
# ---------------------------------------------------------------
[sshd]
enabled = true
port = ssh
filter = sshd
logpath = /var/log/auth.log
# Only 3 attempts before ban — SSH should use key auth, so
# legitimate password failures are rare.
maxretry = 3
# Override the default 1h ban. SSH brute-force deserves a full
# 24h timeout on first offense, escalating from there.
bantime = 24h
# Aggressive findtime — 3 failures in 5 minutes is a clear attack
findtime = 5m
# ---------------------------------------------------------------
# Nginx — HTTP basic auth failures
# Catches brute-force against .htpasswd-protected endpoints.
# The filter matches "no user/password was provided" and
# "user not found" entries in the Nginx error log.
# ---------------------------------------------------------------
[nginx-http-auth]
enabled = true
filter = nginx-http-auth
port = http,https
logpath = /var/log/nginx/error.log
maxretry = 5
# ---------------------------------------------------------------
# Nginx — bot/vulnerability scanner detection
# Catches bots probing for wp-admin, phpmyadmin, .env files,
# and other common attack paths. These requests usually come in
# bursts from automated scanners. 2 retries because legitimate
# users never request /wp-login.php on a non-WordPress site.
# ---------------------------------------------------------------
[nginx-botsearch]
enabled = true
filter = nginx-botsearch
port = http,https
logpath = /var/log/nginx/access.log
# Low threshold — 2 hits on scanner paths is enough.
# Nobody accidentally browses to /phpmyadmin twice.
maxretry = 2
# ---------------------------------------------------------------
# Nginx — rate limit violations
# Pairs with Nginx limit_req_zone directives. When a client
# exceeds the rate limit, Nginx logs "limiting requests" in
# the error log. Fail2ban catches repeat rate-limit violators
# and bans them at the firewall level, which is more efficient
# than letting Nginx handle the rejection.
# ---------------------------------------------------------------
[nginx-limit-req]
enabled = true
filter = nginx-limit-req
port = http,https
logpath = /var/log/nginx/error.log
maxretry = 10
findtime = 1m
# ---------------------------------------------------------------
# Traefik — authentication failures
# Matches 401 responses from Traefik's access log.
# Requires Traefik access logging enabled in traefik.yml:
# accessLog:
# filePath: /var/log/traefik/access.log
# ---------------------------------------------------------------
[traefik-auth]
enabled = true
filter = traefik-auth
port = http,https
logpath = /var/log/traefik/access.log
maxretry = 5
findtime = 10m
# ---------------------------------------------------------------
# Traefik — bot and vulnerability scanner detection
# Same concept as nginx-botsearch but for Traefik access logs.
# Catches requests for well-known exploit paths routed through
# the Traefik reverse proxy. Requires a custom filter file at
# /etc/fail2ban/filter.d/traefik-botsearch.conf with a regex
# matching common scanner paths in the Traefik log format.
# ---------------------------------------------------------------
[traefik-botsearch]
enabled = true
filter = traefik-botsearch
port = http,https
logpath = /var/log/traefik/access.log
# Very low tolerance — these are always malicious.
maxretry = 2
findtime = 1m
# Ban for 48h on first offense. Bots rotate IPs but catching
# even a few slows down the attack surface.
bantime = 48h
# ---------------------------------------------------------------
# Recidive — the jail that watches other jails
# This is the nuclear option. If an IP gets banned by ANY jail
# 5 times within 24 hours, recidive bans them across ALL ports
# for 1 full week. This catches attackers who rotate between
# attack vectors (SSH brute-force, then web scanning, then
# auth bypass attempts).
#
# The filter reads fail2ban's own log to detect repeat bans.
# Think of it as "you've been banned 5 times today, you're
# done for the week."
# ---------------------------------------------------------------
[recidive]
enabled = true
filter = recidive
# Ban on ALL ports, not just the ones from the original jail.
# An attacker who hits SSH and HTTP is probing everything.
banaction = iptables-allports
logpath = /var/log/fail2ban.log
# 5 bans from any combination of jails within 24 hours
maxretry = 5
findtime = 24h
# Full week ban. Combined with bantime.increment, repeat
# recidive bans escalate even further.
bantime = 1w
HAProxy Load Balancer
Production HAProxy config with frontend/backend definitions, health checks, stats dashboard, and SSL termination.
# /etc/haproxy/haproxy.cfg - Production Load Balancer
global
log /dev/log local0
log /dev/log local1 notice
chroot /var/lib/haproxy
stats socket /run/haproxy/admin.sock mode 660 level admin
stats timeout 30s
user haproxy
group haproxy
daemon
# SSL tuning
ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384
ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
tune.ssl.default-dh-param 2048
defaults
log global
mode http
option httplog
option dontlognull
option forwardfor
option http-server-close
timeout connect 5s
timeout client 30s
timeout server 30s
timeout http-keep-alive 10s
timeout check 5s
retries 3
errorfile 400 /etc/haproxy/errors/400.http
errorfile 403 /etc/haproxy/errors/403.http
errorfile 408 /etc/haproxy/errors/408.http
errorfile 500 /etc/haproxy/errors/500.http
errorfile 502 /etc/haproxy/errors/502.http
errorfile 503 /etc/haproxy/errors/503.http
errorfile 504 /etc/haproxy/errors/504.http
# Stats dashboard — restrict to internal network
listen stats
bind 10.42.0.5:8404
mode http
stats enable
stats uri /haproxy-stats
stats refresh 10s
stats admin if LOCALHOST
stats auth admin:changeme_stats_password
acl internal_net src 10.42.0.0/24
http-request deny unless internal_net
# HTTPS frontend — SSL termination
frontend https_front
bind *:443 ssl crt /etc/haproxy/certs/wildcard.pem alpn h2,http/1.1
http-request set-header X-Forwarded-Proto https
http-request set-header X-Real-IP %[src]
# ACL routing by hostname
acl host_app1 hdr(host) -i app1.example.com
acl host_app2 hdr(host) -i app2.example.com
acl host_api hdr(host) -i api.example.com
use_backend app1_servers if host_app1
use_backend app2_servers if host_app2
use_backend api_servers if host_api
default_backend app1_servers
# HTTP frontend — redirect to HTTPS
frontend http_front
bind *:80
http-request redirect scheme https code 301
# App1 backend — round robin with health checks
backend app1_servers
balance roundrobin
option httpchk GET /health
http-check expect status 200
server app1-node1 10.42.0.21:8080 check inter 5s fall 3 rise 2
server app1-node2 10.42.0.22:8080 check inter 5s fall 3 rise 2
server app1-node3 10.42.0.23:8080 check inter 5s fall 3 rise 2 backup
# App2 backend — least connections
backend app2_servers
balance leastconn
option httpchk GET /ping
http-check expect status 200
cookie SERVERID insert indirect nocache
server app2-node1 10.42.0.31:3000 check cookie s1
server app2-node2 10.42.0.32:3000 check cookie s2
# API backend — weighted routing with connection limits
backend api_servers
balance roundrobin
option httpchk GET /api/v1/health
http-check expect status 200
http-request set-header X-Forwarded-Port %[dst_port]
server api-node1 10.42.0.41:9000 check weight 100 maxconn 500
server api-node2 10.42.0.42:9000 check weight 100 maxconn 500
WireGuard Server
WireGuard VPN server config with interface setup, iptables NAT rules, and multi-peer definitions.
# /etc/wireguard/wg0.conf - WireGuard Server Configuration
[Interface]
# Server private key (generate: wg genkey > privatekey)
PrivateKey = SERVER_PRIVATE_KEY_HERE
# VPN subnet — server gets .1
Address = 10.42.100.1/24
# Listen on default WireGuard port
ListenPort = 51820
# Save peer state on restart
SaveConfig = false
# NAT forwarding rules — enable on up, clean on down
# Replace eth0 with your actual WAN interface
PostUp = iptables -A FORWARD -i wg0 -j ACCEPT
PostUp = iptables -A FORWARD -o wg0 -j ACCEPT
PostUp = iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE
# IPv6 rules mirror the IPv4 ones — including the -o wg0 accept,
# which was previously missing and would have broken return traffic
# on an IPv6-enabled tunnel.
PostUp = ip6tables -A FORWARD -i wg0 -j ACCEPT
PostUp = ip6tables -A FORWARD -o wg0 -j ACCEPT
PostUp = ip6tables -t nat -A POSTROUTING -o eth0 -j MASQUERADE
PostDown = iptables -D FORWARD -i wg0 -j ACCEPT
PostDown = iptables -D FORWARD -o wg0 -j ACCEPT
PostDown = iptables -t nat -D POSTROUTING -o eth0 -j MASQUERADE
PostDown = ip6tables -D FORWARD -i wg0 -j ACCEPT
PostDown = ip6tables -D FORWARD -o wg0 -j ACCEPT
PostDown = ip6tables -t nat -D POSTROUTING -o eth0 -j MASQUERADE
# NOTE(review): WireGuard has no mechanism to push DNS to peers.
# "DNS =" in a *server* [Interface] makes wg-quick rewrite THIS
# host's resolv.conf — almost never what you want on a server.
# Configure DNS in each client's own [Interface] section instead.
# DNS = 10.42.0.1
# -----------------------------------------------------------
# Peer 1: Laptop — full tunnel (routes all traffic through VPN)
# -----------------------------------------------------------
[Peer]
# Public key from client (generate: wg pubkey < privatekey)
PublicKey = PEER1_PUBLIC_KEY_HERE
# Pre-shared key for additional security (generate: wg genpsk)
PresharedKey = PEER1_PRESHARED_KEY_HERE
# Assigned IP within VPN subnet
AllowedIPs = 10.42.100.10/32
# Keep connection alive behind NAT
PersistentKeepalive = 25
# -----------------------------------------------------------
# Peer 2: Phone — split tunnel (VPN for LAN access only)
# -----------------------------------------------------------
[Peer]
PublicKey = PEER2_PUBLIC_KEY_HERE
PresharedKey = PEER2_PRESHARED_KEY_HERE
AllowedIPs = 10.42.100.11/32
PersistentKeepalive = 25
UFW/nftables Firewall
nftables ruleset with input/forward/output chains, rate limiting, port knocking, and structured logging.
#!/usr/sbin/nft -f
# /etc/nftables.conf - Production Firewall Ruleset
# Flush existing rules
flush ruleset
# -----------------------------------------------
# Main inet table — handles both IPv4 and IPv6
# -----------------------------------------------
table inet firewall {
# Rate limit sets
set rate_limit_ssh {
type ipv4_addr
flags dynamic,timeout
timeout 1m
}
set blocked_hosts {
type ipv4_addr
flags interval
# Add known bad actors here
elements = { 203.0.113.0/24, 198.51.100.0/24 }
}
# -------------------------------------------
# Port knocking — knock sequence: 7000, 8000, 9000
# After correct sequence, SSH opens for 30s
# -------------------------------------------
set knock_stage1 {
type ipv4_addr
flags dynamic,timeout
timeout 10s
}
set knock_stage2 {
type ipv4_addr
flags dynamic,timeout
timeout 10s
}
set knock_passed {
type ipv4_addr
flags dynamic,timeout
timeout 30s
}
chain input {
type filter hook input priority 0; policy drop;
# Allow established/related traffic
ct state established,related accept
ct state invalid drop
# Allow loopback
iif "lo" accept
# Drop blocked hosts early
ip saddr @blocked_hosts counter drop
# ICMP — allow ping with rate limit
ip protocol icmp icmp type echo-request \
limit rate 5/second burst 10 packets accept
ip protocol icmp accept
# ICMPv6 — required for IPv6 to function
ip6 nexthdr icmpv6 accept
# Port knocking sequence
tcp dport 7000 add @knock_stage1 { ip saddr } drop
tcp dport 8000 ip saddr @knock_stage1 \
add @knock_stage2 { ip saddr } drop
tcp dport 9000 ip saddr @knock_stage2 \
add @knock_passed { ip saddr } drop
# SSH — only after a successful port knock OR from the LAN.
# New connections from allowed sources are still rate limited so a
# single compromised host can't hammer sshd.
# FIX(review): the previous version had an unconditional
# rate-limited accept here, which opened SSH to the entire
# internet at 3 conn/min per IP and defeated the port knock.
tcp dport 22 ip saddr @knock_passed ct state new \
add @rate_limit_ssh { ip saddr limit rate 3/minute } accept
tcp dport 22 ip saddr 10.42.0.0/24 ct state new \
add @rate_limit_ssh { ip saddr limit rate 3/minute } accept
# Everything else aimed at SSH is counted and dropped here, keeping
# the catch-all logging rule below readable.
tcp dport 22 counter drop
# HTTP/HTTPS
tcp dport { 80, 443 } accept
# DNS (if running local resolver)
tcp dport 53 ip saddr 10.42.0.0/24 accept
udp dport 53 ip saddr 10.42.0.0/24 accept
# WireGuard
udp dport 51820 accept
# Prometheus node exporter — LAN only
tcp dport 9100 ip saddr 10.42.0.0/24 accept
# Log and drop everything else
counter log prefix "[nftables-drop-input] " \
level warn drop
}
chain forward {
type filter hook forward priority 0; policy drop;
# Allow established/related forwarded traffic
ct state established,related accept
ct state invalid drop
# WireGuard to LAN forwarding
iifname "wg0" oifname "eth0" accept
iifname "eth0" oifname "wg0" ct state related,established accept
# Docker bridge forwarding (if needed)
iifname "docker0" accept
oifname "docker0" accept
# Log dropped forwards
counter log prefix "[nftables-drop-forward] " \
level warn drop
}
chain output {
type filter hook output priority 0; policy accept;
# Allow all outbound — restrict if paranoid
# Uncomment below to lock down outbound:
# ct state established,related accept
# oif "lo" accept
# tcp dport { 80, 443, 53 } accept
# udp dport { 53, 123, 51820 } accept
# counter log prefix "[nftables-drop-output] " drop
}
}
# -----------------------------------------------
# NAT table for WireGuard / Docker masquerade
# -----------------------------------------------
table inet nat {
chain postrouting {
type nat hook postrouting priority srcnat;
oifname "eth0" ip saddr 10.42.100.0/24 masquerade
}
} Logrotate for Docker
Logrotate configuration for Docker container logs with compression, size limits, and post-rotate signal handling.
# /etc/logrotate.d/docker-containers
# Rotate Docker container JSON log files
#
# Docker default logging driver (json-file) writes to:
# /var/lib/docker/containers/<id>/<id>-json.log
#
# Also configure daemon.json max-size/max-file as a safety net:
# { "log-driver": "json-file", "log-opts": { "max-size": "50m", "max-file": "5" } }
/var/lib/docker/containers/*/*.log {
# Rotate daily, keep 7 days
daily
rotate 7
# Skip missing logs without error
missingok
notifempty
# Compress rotated logs (gzip by default)
compress
delaycompress
# Truncate in place — Docker holds the file descriptor open,
# so we can't move/recreate. copytruncate handles this safely.
copytruncate
# Size guard — rotate early if a container gets chatty
maxsize 100M
minsize 1M
# Date-based extension for easier identification.
# Include the hour: with "maxsize" a chatty container can rotate more
# than once per day, and a date-only suffix would collide with the
# previous rotation's filename and make logrotate skip the rotation.
dateext
dateformat -%Y%m%d-%H
# Permissions for rotated files
create 0640 root docker
# Run shared scripts once for all matching logs
sharedscripts
postrotate
# No signal is needed: copytruncate truncates the open file in
# place, so dockerd keeps writing to the same file descriptor.
# Do NOT send SIGUSR1 to dockerd — USR1 makes the daemon dump
# goroutine stack traces (for debugging); it does not reopen
# log files, and doing it on every rotation litters the host
# with stack-dump files.
:
endscript
}
# Rotate Docker daemon logs separately
/var/log/docker.log {
weekly
rotate 4
compress
delaycompress
missingok
notifempty
copytruncate
create 0640 root adm
}
# Compose project logs (if logging to file via compose)
/var/log/docker-compose/*.log {
daily
rotate 14
compress
delaycompress
missingok
notifempty
copytruncate
create 0640 root docker
maxsize 50M
sharedscripts
postrotate
# Optionally notify monitoring that logs were rotated
# curl -s -X POST http://10.42.0.5:9093/api/v1/alerts \
# -d '[{"labels":{"alertname":"logrotate","severity":"info"}}]' || true
:
endscript
} PostgreSQL Tuning
Tuned postgresql.conf for a homelab server — memory, WAL, connections, logging, and checkpoint settings.
# /etc/postgresql/16/main/postgresql.conf
# Tuned for homelab: 16GB RAM, SSD storage, mixed workload
# Adjust shared_buffers and work_mem based on your available RAM
# -----------------------------------------------
# Connection Settings
# -----------------------------------------------
listen_addresses = '10.42.0.50'
port = 5432
max_connections = 100
superuser_reserved_connections = 3
# -----------------------------------------------
# Authentication
# -----------------------------------------------
password_encryption = scram-sha-256
# -----------------------------------------------
# Memory — tuned for 16GB total system RAM
# -----------------------------------------------
# 25% of system RAM for shared buffer cache
shared_buffers = 4GB
# Estimate of OS disk cache available to Postgres
effective_cache_size = 12GB
# Per-operation sort/hash memory (careful: multiplied by max_connections)
work_mem = 64MB
# Memory for maintenance ops (VACUUM, CREATE INDEX, etc.)
maintenance_work_mem = 1GB
# Huge pages — reduce TLB misses on Linux (set vm.nr_hugepages in sysctl)
huge_pages = try
# -----------------------------------------------
# WAL (Write-Ahead Log)
# -----------------------------------------------
wal_level = replica
max_wal_size = 4GB
min_wal_size = 1GB
wal_buffers = 64MB
wal_compression = zstd
# Checkpoint tuning — spread I/O over longer window
checkpoint_completion_target = 0.9
checkpoint_timeout = 15min
# -----------------------------------------------
# Replication (ready for standby if needed)
# -----------------------------------------------
max_wal_senders = 3
max_replication_slots = 3
hot_standby = on
# -----------------------------------------------
# Query Planner
# -----------------------------------------------
# SSD storage — lower random page cost since seeks are cheap
random_page_cost = 1.1
effective_io_concurrency = 200
default_statistics_target = 200
# Parallel query settings
max_parallel_workers_per_gather = 4
max_parallel_workers = 8
max_parallel_maintenance_workers = 4
# -----------------------------------------------
# Logging
# -----------------------------------------------
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_rotation_age = 1d
log_rotation_size = 100MB
# Log slow queries (anything over 500ms)
log_min_duration_statement = 500
log_checkpoints = on
log_connections = on
log_disconnections = on
log_lock_waits = on
log_temp_files = 0
log_autovacuum_min_duration = 250
log_line_prefix = '%t [%p]: user=%u,db=%d,app=%a,client=%h '
# -----------------------------------------------
# Autovacuum
# -----------------------------------------------
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 30s
autovacuum_vacuum_threshold = 50
autovacuum_vacuum_scale_factor = 0.05
autovacuum_analyze_threshold = 50
autovacuum_analyze_scale_factor = 0.025
# -----------------------------------------------
# Misc
# -----------------------------------------------
timezone = 'UTC'
lc_messages = 'en_US.UTF-8'
shared_preload_libraries = 'pg_stat_statements' Redis/Valkey Config
Redis-compatible config with memory policies, RDB + AOF persistence, auth, and hardened command access.
# /etc/redis/redis.conf (compatible with Valkey)
# Production config for homelab caching + session store
# -----------------------------------------------
# Network
# -----------------------------------------------
bind 10.42.0.50 127.0.0.1
port 6379
protected-mode yes
tcp-backlog 511
timeout 300
tcp-keepalive 60
# -----------------------------------------------
# Authentication
# -----------------------------------------------
# Require password for all client connections
requirepass your_redis_password_here
# ACL for fine-grained access (Redis 6+)
# user default on >your_redis_password_here ~* &* +@all
# user readonly on >readonly_password ~* &* +@read -@dangerous
# user replication on >repl_password ~* &* +psync +replconf +ping
# -----------------------------------------------
# Memory Management
# -----------------------------------------------
# Max memory — leave room for OS and other services
maxmemory 2gb
# Eviction policy when maxmemory is reached
# allkeys-lru: evict least recently used keys (good for cache)
# volatile-lru: only evict keys with TTL set
maxmemory-policy allkeys-lru
# LRU sampling precision (higher = more accurate, slightly slower)
maxmemory-samples 10
# -----------------------------------------------
# RDB Persistence (Point-in-Time Snapshots)
# -----------------------------------------------
# Save if at least N changes in M seconds
save 900 1
save 300 10
save 60 10000
# Stop writes on RDB save failure (data safety)
stop-writes-on-bgsave-error yes
rdbcompression yes
rdbchecksum yes
dbfilename dump.rdb
dir /var/lib/redis
# -----------------------------------------------
# AOF Persistence (Append-Only File)
# -----------------------------------------------
appendonly yes
appendfilename "appendonly.aof"
# fsync policy: everysec balances durability and performance
# always = safest but slow, no = fastest but risky
appendfsync everysec
# Don't fsync during rewrite (better performance, slight risk)
no-appendfsync-on-rewrite yes
# Auto-rewrite AOF when it grows beyond threshold
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 128mb
aof-use-rdb-preamble yes
# -----------------------------------------------
# Security Hardening
# -----------------------------------------------
# Rename dangerous commands to prevent accidental/malicious use
# Empty string disables the command entirely
rename-command FLUSHDB "FLUSHDB_CONFIRMED"
rename-command FLUSHALL "FLUSHALL_CONFIRMED"
rename-command DEBUG ""
rename-command CONFIG "CONFIG_ADMIN"
rename-command SHUTDOWN "SHUTDOWN_ADMIN"
# -----------------------------------------------
# Logging
# -----------------------------------------------
loglevel notice
logfile /var/log/redis/redis-server.log
# -----------------------------------------------
# Slow Log
# -----------------------------------------------
# Log queries slower than 10ms
slowlog-log-slower-than 10000
slowlog-max-len 256
# -----------------------------------------------
# Clients
# -----------------------------------------------
maxclients 1000
# -----------------------------------------------
# Lua Scripting
# -----------------------------------------------
lua-time-limit 5000
# -----------------------------------------------
# Latency Monitoring
# -----------------------------------------------
latency-monitor-threshold 50 Cloudflared Tunnel
Cloudflare Tunnel config with ingress rules for routing external traffic to internal services without port forwarding.
# /etc/cloudflared/config.yml
# Cloudflare Tunnel — zero-trust access to internal services
# No port forwarding required, all traffic goes through Cloudflare edge
# Tunnel UUID (from: cloudflared tunnel create <name>)
tunnel: a1b2c3d4-e5f6-7890-abcd-ef1234567890
# Path to credentials JSON (created during tunnel setup)
credentials-file: /etc/cloudflared/credentials.json
# Logging
loglevel: info
logfile: /var/log/cloudflared/tunnel.log
# Transport settings
protocol: quic
# Metrics server for Prometheus scraping
metrics: 10.42.0.5:9301
# Grace period before forced shutdown
grace-period: 30s
# -----------------------------------------------
# Ingress Rules
# Route external hostnames to internal services
# Rules are evaluated top-to-bottom, first match wins
# -----------------------------------------------
ingress:
# Grafana dashboard
- hostname: grafana.example.com
service: http://10.42.0.5:3000
originRequest:
noTLSVerify: false
connectTimeout: 10s
# Home Assistant
- hostname: homeassistant.example.com
service: http://10.42.0.15:8123
originRequest:
noTLSVerify: false
httpHostHeader: homeassistant.example.com
# Gitea / Forgejo instance
- hostname: git.example.com
service: http://10.42.0.5:3001
originRequest:
connectTimeout: 30s
noTLSVerify: false
# Proxmox web UI — upstream uses self-signed cert
- hostname: proxmox.example.com
service: https://10.42.0.10:8006
originRequest:
noTLSVerify: true
connectTimeout: 10s
# Jellyfin media server
- hostname: media.example.com
service: http://10.42.0.5:8096
originRequest:
connectTimeout: 10s
# SSH access via cloudflared access
- hostname: ssh.example.com
service: ssh://10.42.0.5:22
# Catch-all — MUST be last, returns 404 for unmatched routes
- service: http_status:404 Samba Server
Samba config with global security settings and three share definitions — media, downloads, and admin-only backups.
# /etc/samba/smb.conf - Samba File Server
# Manage users: smbpasswd -a <username>
# Test config: testparm
# Reload: smbcontrol all reload-config
# -----------------------------------------------
# Global Settings
# -----------------------------------------------
[global]
workgroup = WORKGROUP
server string = Samba File Server
server role = standalone server
# Security
security = user
map to guest = never
encrypt passwords = yes
passdb backend = tdbsam
# Disable guest access entirely
restrict anonymous = 2
# Protocol versions — disable SMBv1
server min protocol = SMB2_10
server max protocol = SMB3
# Logging
log file = /var/log/samba/log.%m
max log size = 5000
log level = 1 auth:3
logging = file
# Performance tuning
socket options = TCP_NODELAY IPTOS_LOWDELAY SO_RCVBUF=131072 SO_SNDBUF=131072
read raw = yes
write raw = yes
use sendfile = yes
aio read size = 16384
aio write size = 16384
# Network — bind to specific interface
interfaces = 10.42.0.50/24 127.0.0.1
bind interfaces only = yes
hosts allow = 10.42.0.0/24 127.0.0.1
hosts deny = 0.0.0.0/0
# macOS compatibility (disable if not needed)
vfs objects = fruit streams_xattr
fruit:metadata = stream
fruit:model = MacSamba
fruit:posix_rename = yes
fruit:veto_appledouble = no
fruit:nfs_aces = no
fruit:wipe_intentionally_left_blank_rfork = yes
fruit:delete_empty_adfiles = yes
# Disable printing (homelab, not an office)
load printers = no
printing = bsd
printcap name = /dev/null
disable spoolss = yes
# -----------------------------------------------
# Media Share — read-only for all authenticated users
# -----------------------------------------------
[media]
comment = Media Library (Read Only)
path = /srv/samba/media
browseable = yes
read only = yes
guest ok = no
valid users = @mediagroup @admin
# Permissions for new files (if write enabled via admin)
create mask = 0644
directory mask = 0755
force group = mediagroup
# -----------------------------------------------
# Downloads — read-write for authenticated users
# -----------------------------------------------
[downloads]
comment = Downloads (Read-Write)
path = /srv/samba/downloads
browseable = yes
read only = no
guest ok = no
valid users = @users @admin
write list = @users @admin
# Permissions
create mask = 0664
directory mask = 0775
force group = users
# Recycle bin — don't lose files to accidental deletes.
# NOTE: a share-level "vfs objects" REPLACES the global list, it does
# not append. Repeat fruit/streams_xattr here or macOS compatibility
# silently breaks on this share.
vfs objects = fruit streams_xattr recycle
recycle:repository = .recycle/%U
recycle:keeptree = yes
recycle:versions = yes
recycle:touch = yes
recycle:maxsize = 0
# -----------------------------------------------
# Backups — admin-only access
# -----------------------------------------------
[backups]
comment = System Backups (Admin Only)
path = /srv/samba/backups
browseable = no
read only = no
guest ok = no
valid users = @admin
write list = @admin
# Strict permissions
create mask = 0600
directory mask = 0700
# WARNING: "force user = root" makes every write on this share run as
# root on the server. Prefer a dedicated low-privilege backup user and
# only keep root if the backup files genuinely must be root-owned.
force user = root
force group = admin
# Hide dot files and system directories
hide dot files = yes
veto files = /lost+found/.Trash*/
delete veto files = no SSH Client Config
Complete SSH client configuration with bastion/jump host patterns, connection multiplexing, per-host identity files, port forwarding, and conditional matching for inside/outside network detection.
# ~/.ssh/config - SSH Client Configuration
# This file controls how YOUR machine connects to remote hosts.
# Not to be confused with sshd_config, which controls the server side.
#
# SSH reads this file top-to-bottom and uses the FIRST matching value
# for each directive. Put specific Host blocks BEFORE wildcards.
# After changes, no restart needed — SSH reads this file on every connection.
# ---------------------------------------------------------------
# Include modular configs
# Split your SSH config into multiple files for organization.
# Useful when managing dozens of hosts across different environments.
# Files in config.d/ can be team-shared or auto-generated by tools
# like Ansible or Terraform.
# ---------------------------------------------------------------
Include ~/.ssh/config.d/*
# ---------------------------------------------------------------
# Bastion / Jump Host
# All connections to *.internal hosts go through the bastion.
# ProxyJump is the modern replacement for ProxyCommand with
# netcat. It handles multiple hops, agent forwarding, and
# connection multiplexing automatically.
# ---------------------------------------------------------------
Host bastion
HostName 10.42.0.1
User commander
Port 22
IdentityFile ~/.ssh/id_ed25519_bastion
# The bastion itself should never forward your agent.
# If the bastion is compromised, an attacker with your forwarded
# agent can authenticate as you to any host that trusts your key.
ForwardAgent no
# ---------------------------------------------------------------
# Conditional matching — detect if we're inside the LAN
# Match exec runs a command; if it exits 0, the block applies.
# This ping test checks if the gateway is reachable, meaning
# we're on the local network and don't need the bastion.
# When outside the network (VPN down, on travel), the ping fails
# and SSH falls through to the next matching block which uses ProxyJump.
# ---------------------------------------------------------------
Match host *.internal exec "ping -c1 -W1 10.42.0.1 >/dev/null 2>&1"
# On the local network: connect directly, no jump host needed.
# accept-new automatically accepts host keys the first time you
# connect to a new host, but rejects CHANGED keys (MITM protection).
# Safe on a trusted LAN where you're adding new VMs regularly.
StrictHostKeyChecking accept-new
ProxyJump none
# ---------------------------------------------------------------
# Wildcard block for all *.internal hosts
# This applies to any host ending in .internal that didn't match
# the conditional block above (meaning we're outside the LAN).
# ---------------------------------------------------------------
Host *.internal
User commander
# When outside the LAN, route through the bastion host.
# SSH opens a connection to bastion first, then tunnels
# through it to reach the final destination.
ProxyJump bastion
# Full host key verification when coming through the bastion.
# Outside the network, we can't trust new keys automatically.
StrictHostKeyChecking yes
# Forward the SSH agent to the target host through the bastion.
# WARNING: ForwardAgent lets the remote host use your local SSH keys
# to authenticate to OTHER hosts. Only enable this on hosts you fully
# trust. A compromised remote host with your forwarded agent can
# impersonate you to any server that accepts your key.
# Prefer ProxyJump chains instead of ForwardAgent when possible.
ForwardAgent no
# ---------------------------------------------------------------
# Per-host definitions — homelab machines
# Each host gets its own identity file. Using separate keys per host
# means a compromised key only affects one machine, and you can
# revoke access to a single host without regenerating everything.
# ---------------------------------------------------------------
# Primary Proxmox hypervisor
Host izar
HostName 10.42.0.10
User root
IdentityFile ~/.ssh/id_ed25519_izar
# Proxmox web UI access through SSH tunnel.
# After connecting: open https://localhost:8006 in your browser.
# This avoids exposing the Proxmox web UI to the network.
LocalForward 8006 127.0.0.1:8006
# Secondary Proxmox host
Host arcturus
HostName 10.42.0.11
User root
IdentityFile ~/.ssh/id_ed25519_arcturus
# Proxmox always serves its UI on remote port 8006. Use LOCAL port
# 8007 so this tunnel can coexist with izar's tunnel on local 8006:
# after connecting, open https://localhost:8007 for this host.
# (LocalForward syntax: <local port> <remote host>:<remote port>)
LocalForward 8007 127.0.0.1:8006
# Services gateway
Host altair
HostName 10.42.0.5
User commander
IdentityFile ~/.ssh/id_ed25519_altair
# Tunnel to Grafana running on this host.
# Connect, then open http://localhost:3000 locally.
LocalForward 3000 127.0.0.1:3000
# Database server — tunnel for direct DB access
Host capella-db
HostName 10.42.0.50
User commander
IdentityFile ~/.ssh/id_ed25519_capella
# PostgreSQL tunnel. Lets you run psql or a GUI client
# against localhost:5432 while the actual database is
# only listening on the remote host's local interface.
# Much safer than exposing Postgres to the network.
LocalForward 5432 127.0.0.1:5432
# Redis tunnel on a non-standard local port to avoid
# conflicts if you run Redis locally.
LocalForward 6380 127.0.0.1:6379
# Desktop workstation
Host capella
HostName 10.42.0.100
User commander
IdentityFile ~/.ssh/id_ed25519_capella
# ---------------------------------------------------------------
# Connection multiplexing (ControlMaster)
# Reuses a single TCP connection for multiple SSH sessions to
# the same host. First connection opens the master socket;
# subsequent connections piggyback on it. Eliminates the TCP
# handshake, key exchange, and authentication overhead for
# every new session.
#
# ControlPersist keeps the master connection alive for 10 minutes
# after the last session closes, so rapid reconnects are instant.
#
# ControlPath defines where the socket file lives. %r=remote user,
# %h=host, %p=port ensures one socket per unique connection.
# ---------------------------------------------------------------
Host *
ControlMaster auto
# The socket directory must exist before the first connection:
#   mkdir -p -m 700 ~/.ssh/sockets
# Otherwise SSH fails to create the master socket and falls back
# to a plain (non-multiplexed) connection with a warning.
ControlPath ~/.ssh/sockets/%r@%h-%p
ControlPersist 10m
# ---------------------------------------------------------------
# Keepalive — detect dead connections
# ServerAliveInterval sends a keepalive packet every 60 seconds
# through the encrypted channel. If 3 consecutive keepalives
# get no response (3 minutes total), SSH closes the connection.
# This prevents "frozen" terminals when a network path dies.
# Also keeps NAT tables alive on firewalls/routers that drop
# idle connections after a timeout.
# ---------------------------------------------------------------
ServerAliveInterval 60
ServerAliveCountMax 3
# Default identity file for hosts without a specific key.
IdentityFile ~/.ssh/id_ed25519
# Only offer the identity files configured for this host.
# Without this, SSH tries every key in ~/.ssh/ against every
# host, which can trigger fail2ban on the remote side if you
# have many keys (each key attempt counts as a failed auth).
IdentitiesOnly yes
# Hash hostnames in known_hosts. If your laptop is stolen,
# the attacker can't read known_hosts to learn your infrastructure.
# The downside is you can't grep known_hosts for a hostname anymore.
HashKnownHosts yes
# Use modern key exchange and ciphers by default.
# Matches the server-side sshd_config for a clean handshake.
KexAlgorithms curve25519-sha256,[email protected]
Ciphers [email protected],[email protected]
# Request compression for all connections. Helps on slow links,
# negligible overhead on fast LANs.
Compression yes
# Visual host key fingerprint. Shows a randomart image on connect
# so you can visually spot if a host key changed (MITM detection).
VisualHostKey yes Unbound Recursive DNS
Full recursive DNS resolver with DNSSEC validation, cache tuning, privacy hardening, local zone overrides for internal DNS, and performance tuning for a homelab network.
# /etc/unbound/unbound.conf - Recursive DNS Resolver
#
# Unbound vs Pi-hole vs both:
# - Unbound is a RECURSIVE resolver: it queries root servers and walks
# the DNS tree itself instead of forwarding to Google/Cloudflare.
# This gives you full DNS independence and DNSSEC validation.
# - Pi-hole is a FILTERING layer: it blocks ads/trackers by domain name.
# - Best setup: Pi-hole as the network's DNS server (port 53), forwarding
# to Unbound (port 5335) for recursive resolution. Pi-hole handles
# blocklists, Unbound handles the actual DNS lookups.
# Client -> Pi-hole:53 -> Unbound:5335 -> Root servers
#
# Why recursive instead of forwarding to 1.1.1.1 or 8.8.8.8?
# - Privacy: Cloudflare/Google see every domain you query. With Unbound,
# your queries go directly to authoritative nameservers.
# - DNSSEC: Full validation chain from root to leaf, not trusting a
# third party's validation.
# - Independence: No dependency on upstream resolvers going down or
# being censored.
server:
# ---------------------------------------------------------------
# Network settings
# ---------------------------------------------------------------
# Listen on the LAN-facing interface. Don't use 0.0.0.0 unless
# you want the entire internet using your resolver (open resolver = bad).
interface: 10.42.0.1
# Also listen on localhost for local queries
interface: 127.0.0.1
# Port 5335 instead of 53 because Pi-hole occupies port 53.
# If running Unbound standalone without Pi-hole, use port 53.
port: 5335
# Protocols to serve on. Enable both TCP and UDP.
# TCP is needed for large responses (DNSSEC signatures can exceed
# the 512-byte UDP limit) and zone transfers.
do-ip4: yes
do-ip6: no
do-udp: yes
do-tcp: yes
# ---------------------------------------------------------------
# Access control
# Only allow queries from the local network. Without this,
# anyone who can reach port 5335 can use your resolver,
# which enables DNS amplification attacks.
# ---------------------------------------------------------------
access-control: 10.42.0.0/24 allow
access-control: 127.0.0.0/8 allow
access-control: 0.0.0.0/0 refuse
# ---------------------------------------------------------------
# DNSSEC validation
# Validates the cryptographic chain of trust from the DNS root
# all the way to the queried domain. If a response fails
# validation, Unbound returns SERVFAIL instead of poisoned data.
# The auto-trust-anchor-file is maintained by unbound-anchor
# and contains the root zone trust anchor (KSK).
# Run 'unbound-anchor' periodically (cron) to keep it current.
# ---------------------------------------------------------------
auto-trust-anchor-file: /var/lib/unbound/root.key
# Harden against out-of-zone data in responses.
# Prevents a malicious authoritative server from injecting
# records for domains it doesn't own.
harden-glue: yes
# Require DNSSEC data for trust-anchored zones. If a domain
# is supposed to be signed but the signatures are missing,
# treat it as bogus (validation failure).
harden-dnssec-stripped: yes
# Use 0x20-encoded random bits in query names for additional
# forgery resistance. Mixes case in the query (e.g., "eXaMpLe.CoM")
# and verifies the response matches. Makes cache poisoning harder.
use-caps-for-id: yes
# ---------------------------------------------------------------
# Privacy settings
# ---------------------------------------------------------------
# Don't reveal the server identity in CH TXT queries.
# Without this, anyone can query "hostname.bind" or "id.server"
# to learn your resolver's identity.
hide-identity: yes
hide-version: yes
# QNAME minimisation: instead of sending the full query name
# (e.g., "mail.sub.example.com") to every server in the chain,
# only send the minimum needed at each step. Root server sees
# ".com", .com server sees "example.com", etc. Reduces information
# leakage to intermediate nameservers.
qname-minimisation: yes
# Aggressive NSEC: use DNSSEC denial-of-existence records to
# synthesize negative answers from cache. Reduces queries to
# authoritative servers for non-existent domains.
aggressive-nsec: yes
# Don't send the Unbound version in responses
identity: ""
version: ""
# ---------------------------------------------------------------
# Performance tuning
# ---------------------------------------------------------------
# Match thread count to CPU cores. Each thread gets its own
# cache shard, so more threads = more parallel resolution.
# Don't exceed core count — more threads than cores adds
# context switching overhead without benefit.
num-threads: 4
# Message cache: stores full DNS responses (header + answer).
# 50MB is generous for a homelab. Adjust based on unique
# domains queried — 50MB holds roughly 500K-1M entries.
msg-cache-size: 50m
# RRset cache: stores individual resource record sets.
# Should be roughly 2x the msg-cache-size because a single
# message may reference multiple RRsets (A + AAAA + NS + etc).
rrset-cache-size: 100m
# Key cache for DNSSEC validation keys. 25MB is plenty
# unless you're resolving millions of unique signed domains.
key-cache-size: 25m
# Socket buffer sizes. Larger buffers help when handling bursts
# of concurrent queries. 8MB handles a busy LAN without drops.
# May need: sysctl net.core.rmem_max=8388608
so-rcvbuf: 8m
so-sndbuf: 8m
# Prefetch: when a cached entry is about to expire (within 10%
# of its TTL), Unbound fetches a fresh copy in the background.
# The next query gets the cached result instantly instead of
# waiting for a recursive lookup. Trades a small amount of
# extra upstream traffic for consistently fast responses.
prefetch: yes
# Prefetch DNSKEY and DS records for DNSSEC validation before
# they expire. Prevents validation delays on cache expiry.
prefetch-key: yes
# Serve expired cache entries while fetching fresh data in the
# background. If an authoritative server is down, clients still
# get answers (stale but functional) instead of SERVFAIL.
serve-expired: yes
serve-expired-ttl: 86400
# ---------------------------------------------------------------
# Hardening
# ---------------------------------------------------------------
# Minimum TTL for cached records. Overrides the authoritative
# TTL if it's lower than this. Prevents cache-busting attacks
# that set TTL=0 to force constant re-resolution.
cache-min-ttl: 300
# Maximum TTL cap. Even if a record says "cache for 1 week",
# cap it at 1 day so stale records eventually refresh.
cache-max-ttl: 86400
# Unwanted reply threshold. If Unbound receives more than 10M
# unsolicited replies (possible DNS amplification reflection),
# it clears the RRset and message caches. Safety valve against
# cache poisoning via flooding.
unwanted-reply-threshold: 10000000
# Don't allow queries for localhost reverse or RFC1918 reverse
# to leak to the internet. Resolve them locally only.
private-address: 10.0.0.0/8
private-address: 172.16.0.0/12
private-address: 192.168.0.0/16
private-address: 169.254.0.0/16
# ---------------------------------------------------------------
# Local zone overrides — internal DNS
# These zones are authoritative in Unbound. Queries for these
# domains never leave your network.
# "static" means only the records defined here are valid;
# any other query under this zone returns NXDOMAIN.
# ---------------------------------------------------------------
local-zone: "homelab.internal." static
# Map hostnames to IPs for all homelab machines.
# This replaces the need for a full DNS server (BIND/PowerDNS)
# for simple internal name resolution.
local-data: "izar.homelab.internal. A 10.42.0.10"
local-data: "arcturus.homelab.internal. A 10.42.0.11"
local-data: "altair.homelab.internal. A 10.42.0.5"
local-data: "capella.homelab.internal. A 10.42.0.100"
local-data: "sirius.homelab.internal. A 10.42.0.50"
local-data: "gateway.homelab.internal. A 10.42.0.1"
# PTR records for reverse lookups. Lets tools like 'host' and
# 'nslookup' resolve IPs back to hostnames.
local-data-ptr: "10.42.0.10 izar.homelab.internal."
local-data-ptr: "10.42.0.11 arcturus.homelab.internal."
local-data-ptr: "10.42.0.5 altair.homelab.internal."
local-data-ptr: "10.42.0.100 capella.homelab.internal."
local-data-ptr: "10.42.0.50 sirius.homelab.internal."
local-data-ptr: "10.42.0.1 gateway.homelab.internal."
# ---------------------------------------------------------------
# Reverse DNS for the local subnet
# The local-data-ptr records above already make Unbound answer
# reverse lookups for 10.42.0.x authoritatively, so no stub zone
# is needed. Do NOT add a stub-zone whose stub-addr points back at
# this same Unbound instance (e.g. 127.0.0.1@5335) — that creates
# a query loop. Only use a stub zone when a SEPARATE authoritative
# server hosts the reverse zone, for example:
# ---------------------------------------------------------------
# stub-zone:
#     name: "0.42.10.in-addr.arpa."
#     stub-addr: 10.42.0.2@53
# ---------------------------------------------------------------
# Forward zone example (optional)
# If specific domains need to be resolved by a particular
# upstream server (e.g., corporate VPN split DNS), define
# a forward zone. Queries for this zone skip recursive
# resolution and go directly to the specified forwarders.
# ---------------------------------------------------------------
# forward-zone:
# name: "corp.example.com."
# forward-addr: 10.42.0.1@53
# # forward-tls-upstream: yes # Enable DNS-over-TLS to forwarder Caddyfile
Full Caddy reverse proxy configuration with wildcard TLS via Cloudflare DNS, basicauth, IP allowlists, rate limiting, file serving, path-based routing, health checks, and shared snippets.
# /etc/caddy/Caddyfile - Reverse Proxy Configuration
#
# Caddy vs Traefik vs Nginx:
# - Caddy: automatic HTTPS by default, simple config syntax, great for
# straightforward reverse proxy setups. Config reloads are seamless.
# - Traefik: built for Docker/Kubernetes service discovery via labels.
# Better when containers come and go constantly.
# - Nginx: maximum flexibility and performance, but you manage certs
# manually (or via certbot) and config is more verbose.
#
# Caddy automatically obtains and renews TLS certificates from
# Let's Encrypt. No certbot cron jobs, no manual renewal.
# ---------------------------------------------------------------
# Global options block — applies to all sites
# Must be the FIRST block in the Caddyfile (before any site blocks).
# ---------------------------------------------------------------
{
# Email for Let's Encrypt account registration and certificate
# expiry notifications. Required for ACME certificate issuance.
email [email protected]
# Use Cloudflare DNS challenge for certificate validation.
# This proves domain ownership by creating a TXT record via
# the Cloudflare API, which means:
# 1. No need to expose port 80 for HTTP challenge
# 2. Wildcard certificates work (HTTP challenge can't do wildcards)
# 3. Works behind firewalls and NAT without port forwarding
# Requires the CF_API_TOKEN environment variable.
# (Traefik equivalent: certificatesResolvers.letsencrypt.acme.dnsChallenge)
# (Nginx equivalent: certbot --dns-cloudflare plugin)
acme_dns cloudflare {env.CF_API_TOKEN}
# Enable HTTP/3 (QUIC) alongside HTTP/1.1 and HTTP/2.
# HTTP/3 uses UDP instead of TCP, reducing latency on lossy
# connections (mobile, WiFi). Browsers auto-upgrade if supported.
# (Traefik equivalent: entryPoints.websecure.http3)
# (Nginx equivalent: not natively supported, requires quiche patch)
servers {
protocols h1 h2 h3
}
# Grace period for active connections during config reload.
# Caddy drains existing connections for this long before
# forcing them closed. Prevents dropped requests on reload.
grace_period 10s
}
# ---------------------------------------------------------------
# Shared snippets — reusable config blocks
# Import these in site blocks with: import security_headers
# (Traefik equivalent: middleware chains defined in dynamic config)
# (Nginx equivalent: include /etc/nginx/snippets/security.conf)
# ---------------------------------------------------------------
(security_headers) {
header {
# Prevent clickjacking by disallowing iframe embedding
X-Frame-Options "SAMEORIGIN"
# Stop browsers from MIME-type sniffing responses
X-Content-Type-Options "nosniff"
# Control referrer information sent with requests
Referrer-Policy "strict-origin-when-cross-origin"
# Content Security Policy — adjust per application
Content-Security-Policy "default-src 'self'"
# Opt out of Google FLoC/Topics tracking
Permissions-Policy "interest-cohort=()"
# Remove the Server header to avoid version disclosure
-Server
}
}
(internal_only) {
# Restrict access to the local network only.
# Requests from outside 10.42.0.0/24 get a 403 Forbidden.
# (Traefik equivalent: ipAllowList middleware)
# (Nginx equivalent: allow 10.42.0.0/24; deny all;)
@blocked not remote_ip 10.42.0.0/24
respond @blocked "Forbidden" 403
}
# ---------------------------------------------------------------
# Wildcard TLS certificate
# A single cert covering *.example.com so every subdomain gets
# HTTPS automatically without individual certificate requests.
# (Traefik equivalent: tls.domains[].sans: ["*.example.com"])
# (Nginx equivalent: ssl_certificate for the wildcard cert from certbot)
# ---------------------------------------------------------------
*.example.com {
tls {
dns cloudflare {env.CF_API_TOKEN}
}
}
# ---------------------------------------------------------------
# Grafana — simple reverse proxy
# The most basic pattern: take incoming requests and forward
# them to an internal service. Caddy handles TLS termination,
# compression, and HTTP/2 automatically.
# (Traefik equivalent: Docker label traefik.http.routers.grafana.rule=Host)
# (Nginx equivalent: proxy_pass http://10.42.0.5:3000)
# ---------------------------------------------------------------
grafana.example.com {
import security_headers
reverse_proxy 10.42.0.5:3000 {
# Pass the original Host header to the backend.
# Some apps (Grafana included) need this to generate
# correct URLs in their UI.
header_up Host {host}
header_up X-Real-IP {remote_host}
header_up X-Forwarded-Proto {scheme}
}
}
# ---------------------------------------------------------------
# Proxmox — reverse proxy with IP allowlist
# Admin panels should never be exposed to the public internet.
# The import internal_only snippet restricts access to the LAN.
# (Traefik equivalent: ipAllowList middleware on the router)
# (Nginx equivalent: allow/deny directives in the location block)
# ---------------------------------------------------------------
proxmox.example.com {
import security_headers
import internal_only
reverse_proxy https://10.42.0.10:8006 {
# Proxmox uses a self-signed certificate by default.
# Trust it for the backend connection only.
transport http {
tls_insecure_skip_verify
}
}
}
# ---------------------------------------------------------------
# Git server — reverse proxy with basicauth
# Adds HTTP basic authentication in front of the upstream service.
# Passwords are bcrypt-hashed (generate with: caddy hash-password).
# (Traefik equivalent: basicAuth middleware with usersFile)
# (Nginx equivalent: auth_basic with .htpasswd file)
# ---------------------------------------------------------------
git.example.com {
import security_headers
# Rate limit requests to slow down brute-force login attempts:
# 5 requests per client IP per 1-second window.
# NOTE: rate_limit is NOT built into Caddy — it requires the
# mholt/caddy-ratelimit plugin (build with xcaddy --with).
# (Traefik equivalent: rateLimit middleware)
# (Nginx equivalent: limit_req_zone + limit_req)
rate_limit {
zone git_limit {
key {remote_host}
events 5
window 1s
}
}
# NOTE: renamed to basic_auth in Caddy 2.8+ (basicauth remains as an alias).
basicauth {
# Username: commander, password hash generated by caddy hash-password
# Never put plaintext passwords here.
commander $2a$14$EXAMPLE_BCRYPT_HASH_HERE
}
reverse_proxy 10.42.0.5:3001
}
# ---------------------------------------------------------------
# App with path-based routing
# Multiple backend services behind a single domain, routed by
# URL path. handle_path strips the prefix before forwarding,
# so the backend sees / instead of /api.
# (Traefik equivalent: PathPrefix routing rule with StripPrefix middleware)
# (Nginx equivalent: location /api { proxy_pass with trailing slash })
# ---------------------------------------------------------------
app.example.com {
import security_headers
# API backend — strip /api prefix before forwarding
handle_path /api/* {
reverse_proxy 10.42.0.5:8080
}
# WebSocket backend — Caddy auto-detects and upgrades
# WebSocket connections, no special config needed.
handle_path /ws/* {
reverse_proxy 10.42.0.5:8081
}
# Health check endpoint — respond directly without proxying.
# Useful for load balancer health probes.
# (Traefik equivalent: ping entrypoint)
# (Nginx equivalent: location /health { return 200; })
handle /health {
respond "OK" 200
}
# Static file server for the frontend SPA
# file_server serves files from disk. try_files falls back
# to index.html for client-side routing (React/Vue/etc).
# (Traefik equivalent: not built-in, needs a separate file server)
# (Nginx equivalent: root + try_files $uri $uri/ /index.html)
handle {
root * /var/www/app/dist
try_files {path} /index.html
file_server
}
}
# ---------------------------------------------------------------
# Static documentation site
# Pure file server with directory browsing disabled.
# Caddy adds gzip/brotli compression automatically.
# ---------------------------------------------------------------
docs.example.com {
import security_headers
root * /var/www/docs
# Directory listings are disabled by default in Caddy's file_server;
# the `browse` subdirective ENABLES them, so it must be omitted here.
file_server
# Cache static assets aggressively. HTML gets a short cache
# so updates propagate quickly.
@static path *.css *.js *.png *.jpg *.svg *.woff2
header @static Cache-Control "public, max-age=31536000, immutable"
@html path *.html /
header @html Cache-Control "public, max-age=3600"
}

sysctl.conf Network Tuning
Production sysctl configuration for network performance, TCP BBR congestion control, connection tracking, SYN flood protection, buffer tuning, and kernel limits for high-connection servers.
# /etc/sysctl.conf (or /etc/sysctl.d/99-homelab.conf)
# Apply changes: sysctl -p /etc/sysctl.conf
# Verify a value: sysctl net.ipv4.tcp_congestion_control
#
# These settings tune the Linux kernel's network stack and memory
# management for a homelab server running Docker, VPN, reverse proxies,
# and databases. Defaults are conservative for compatibility; these
# values assume a modern server with 16GB+ RAM and gigabit networking.
# ===============================================================
# TCP Congestion Control — BBR
# ===============================================================
# BBR (Bottleneck Bandwidth and Round-trip propagation time) is
# Google's congestion control algorithm. It models the network
# path instead of reacting to packet loss, which gives:
# - Higher throughput on lossy links (WiFi, WAN)
# - Lower latency under congestion
# - Better bandwidth utilization
#
# Default: pfifo_fast (qdisc), cubic (congestion control)
# BBR requires the 'fq' (Fair Queue) qdisc to work correctly.
# Without fq, BBR falls back to cubic-like behavior.
net.core.default_qdisc = fq
net.ipv4.tcp_congestion_control = bbr
# ===============================================================
# TCP/UDP Buffer Sizes
# ===============================================================
# Maximum socket receive/send buffer sizes that any application can
# request via setsockopt(). The kernel won't allocate more than this
# regardless of what an application asks for.
# Default: 212992 (208KB). Too small for high-throughput transfers
# like NFS, iSCSI, or WireGuard on gigabit links.
# 16MB handles 10Gbps links with reasonable RTT.
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
# Default receive/send buffer for sockets that don't explicitly set one.
# Most applications use the default, so this matters more than rmem_max
# for typical traffic.
# Default: 212992 (208KB)
net.core.rmem_default = 1048576
net.core.wmem_default = 1048576
# TCP auto-tuning buffer sizes: min, default, max (in bytes).
# The kernel dynamically adjusts each connection's buffers between
# these bounds based on available memory and connection bandwidth.
# min: 4KB (low-memory safety net)
# default: 1MB (starting point for new connections)
# max: 16MB (ceiling for high-bandwidth paths)
# Default: 4096 131072 6291456
net.ipv4.tcp_rmem = 4096 1048576 16777216
net.ipv4.tcp_wmem = 4096 1048576 16777216
# ===============================================================
# Connection Backlog and Queue Sizes
# ===============================================================
# netdev_budget: maximum packets the kernel processes per NAPI
# polling cycle (softirq). Default: 300. Raise for sustained high
# packet rates so polling isn't cut short.
net.core.netdev_budget = 600
# somaxconn: maximum accept() backlog per listening socket.
# Default: 4096 (128 before kernel 5.4). Raise for busy servers.
net.core.somaxconn = 65535
# netdev_max_backlog: packets queued on the INPUT side when the
# interface receives packets faster than the kernel can process them.
# Default: 1000. Increase for bursty traffic or multi-queue NICs.
net.core.netdev_max_backlog = 65536
# ===============================================================
# Connection Tracking (conntrack)
# ===============================================================
# Maximum entries in the connection tracking table. Every connection
# through iptables/nftables NAT or stateful rules uses one entry.
# Default: 65536. Docker alone can exhaust this with a few busy
# containers. Each entry uses ~300 bytes of kernel memory.
# 262144 entries = ~75MB RAM. Scale based on concurrent connections.
net.netfilter.nf_conntrack_max = 262144
# How long established TCP connections stay in the conntrack table
# after the last packet. Default: 432000 (5 days). That's way too
# long — dead connections accumulate and fill the table.
# 1 hour is reasonable for a homelab. Active connections send
# keepalives and renew their entry automatically.
net.netfilter.nf_conntrack_tcp_timeout_established = 3600
# Timeout for connections in TIME_WAIT state in conntrack.
# Default: 120. Can lower to 30 for faster table cleanup.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 30
# ===============================================================
# SYN Flood Protection
# ===============================================================
# Enable SYN cookies. When the SYN queue fills up (potential SYN flood),
# the kernel encodes connection state in the SYN-ACK's sequence number
# instead of allocating memory. Legitimate clients complete the handshake
# normally; attackers waste their packets.
# Default: 1 (enabled). Keep it on.
net.ipv4.tcp_syncookies = 1
# Maximum number of SYN requests queued (per listening socket) waiting
# for the final ACK of the three-way handshake.
# Default: 128. Absurdly low for any server handling real traffic.
# 65536 handles burst connection spikes without dropping SYNs.
net.ipv4.tcp_max_syn_backlog = 65536
# Number of SYN retransmits before giving up on a new outbound connection.
# Default: 6 (roughly 63 seconds). Lower to 2 for faster failure
# detection on flaky connections.
net.ipv4.tcp_syn_retries = 2
# Number of SYN-ACK retransmits before dropping a half-open connection.
# Default: 5. Lower to 3 for faster cleanup of abandoned handshakes.
net.ipv4.tcp_synack_retries = 3
# ===============================================================
# TCP Keepalive
# ===============================================================
# How long before the first keepalive probe (seconds).
# Default: 7200 (2 hours). Way too long for detecting dead connections.
# 600 seconds (10 minutes) catches dead peers faster and frees resources.
net.ipv4.tcp_keepalive_time = 600
# Interval between keepalive probes after the first one.
# Default: 75. 15 seconds means faster detection after the first probe.
net.ipv4.tcp_keepalive_intvl = 15
# Number of failed keepalive probes before declaring the connection dead.
# Default: 9. With 15s intervals, 5 probes = 75 seconds to detect a
# dead peer (total: 600 + 75 = 675 seconds from last activity).
net.ipv4.tcp_keepalive_probes = 5
# ===============================================================
# TCP TIME_WAIT Handling
# ===============================================================
# Allow reuse of TIME_WAIT sockets for new outbound connections when
# safe (timestamp-protected). Default: 2 in recent kernels, which
# enables reuse for loopback connections ONLY; 1 extends it to all
# connections. Critical for high-connection servers (reverse proxies,
# load balancers) that open/close thousands of connections per second.
# Without this, you run out of local ports.
net.ipv4.tcp_tw_reuse = 1
# ===============================================================
# IP Forwarding
# ===============================================================
# Enable IPv4 packet forwarding. Required for:
# - WireGuard/VPN gateways (routing between VPN and LAN)
# - Docker (container networking uses NAT forwarding)
# - Acting as a router between subnets
# Default: 0 (disabled). ONLY enable on machines that need to route.
# Enabling this on a desktop/workstation is a security risk.
net.ipv4.ip_forward = 1
# IPv6 forwarding. Same concept as IPv4.
# Default: 0. Enable only if routing IPv6 traffic.
net.ipv6.conf.all.forwarding = 1
# ===============================================================
# ARP Cache Tuning
# ===============================================================
# The ARP cache maps IP addresses to MAC addresses. On large networks
# (many Docker containers, VMs, or VPN peers), the default ARP cache
# size is too small, causing "Neighbour table overflow" kernel messages.
#
# gc_thresh1: Minimum entries before garbage collection starts
# gc_thresh2: Soft limit — GC runs every 5 seconds above this
# gc_thresh3: Hard limit — entries above this are dropped immediately
# Default: 128, 512, 1024. Fine for a home network with 10 devices.
# Docker with 50+ containers needs bigger tables.
net.ipv4.neigh.default.gc_thresh1 = 1024
net.ipv4.neigh.default.gc_thresh2 = 4096
net.ipv4.neigh.default.gc_thresh3 = 8192
# ===============================================================
# Memory and Swap
# ===============================================================
# Swappiness controls how aggressively the kernel swaps memory pages
# to disk. Range: 0-100.
# Default: 60 (balanced desktop use).
# For servers: 10 keeps more application data in RAM while still
# allowing swap when genuinely needed. 0 disables swap entirely,
# which risks OOM-kills under memory pressure.
vm.swappiness = 10
# How quickly the kernel reclaims directory and inode cache.
# Default: 100. Lower values (50) make the kernel prefer keeping
# filesystem metadata in cache, improving performance for workloads
# that stat/open many files (build systems, package managers).
vm.vfs_cache_pressure = 50
# ===============================================================
# File and Inotify Limits
# ===============================================================
# Maximum number of open file descriptors system-wide.
# Each network connection, open file, and pipe uses one FD.
# Default: ~100000 (varies by RAM). 1M handles high-connection servers
# running reverse proxies and database connection pools.
fs.file-max = 1048576
# Maximum inotify watches per user. inotify lets applications watch
# files for changes. Docker, K8s, and development tools (webpack,
# nodemon) use one watch per file/directory.
# Default: 8192. Docker with many containers and a K3s cluster with
# mounted configmaps will hit this fast.
# 524288 handles large container deployments without "inotify watch limit reached" errors.
fs.inotify.max_user_watches = 524288
# Maximum inotify instances per user. Each application watching files
# creates an instance.
# Default: 128. Raise alongside max_user_watches.
fs.inotify.max_user_instances = 1024

Alertmanager Config
Full Alertmanager alerting pipeline with routing tree, severity-based receivers for Slack/email/PagerDuty, inhibit rules to suppress redundant alerts, and notification templates.
# /etc/alertmanager/alertmanager.yml
# Alertmanager receives alerts from Prometheus and routes them to
# notification channels (Slack, email, PagerDuty, webhooks, etc).
#
# How the pipeline works:
# 1. Prometheus evaluates alerting rules and pushes firing alerts here
# 2. Alertmanager GROUPS related alerts (e.g., 5 instances of HighCPU)
# 3. Applies INHIBIT rules (suppress warnings when critical is firing)
# 4. Routes alerts through the ROUTING TREE to find the right receiver
# 5. Applies SILENCES (manual muting during maintenance)
# 6. Sends notifications via the matched RECEIVER
#
# Reload config: kill -HUP $(pidof alertmanager)
# Or: curl -X POST http://10.42.0.5:9093/-/reload
# ---------------------------------------------------------------
# Global settings — shared across all receivers
# ---------------------------------------------------------------
global:
# SMTP settings for email notifications.
# Used by all email receivers unless overridden per-receiver.
smtp_smarthost: 'smtp.example.com:587'
smtp_from: '[email protected]'
smtp_auth_username: '[email protected]'
smtp_auth_password: 'SMTP_PASSWORD_HERE'
smtp_require_tls: true
# Slack webhook URL — global default used by all Slack receivers.
# Override per-receiver if you send to different channels.
slack_api_url: 'https://hooks.slack.com/services/T00/B00/XXXX'
# PagerDuty integration URL (usually the default, rarely changed).
pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
# How long Alertmanager waits without receiving an update for an
# alert before marking it resolved. Only applies to alerts that
# don't set their own end time (Prometheus-sent alerts include one,
# so this is mainly a safety net for other senders).
resolve_timeout: 5m
# ---------------------------------------------------------------
# Notification templates
# Custom Go templates for formatting notification messages.
# Without custom templates, Alertmanager sends a generic format
# that's functional but not great for quick triage.
# ---------------------------------------------------------------
templates:
- '/etc/alertmanager/templates/*.tmpl'
# ---------------------------------------------------------------
# Routing tree
# Alerts enter at the top-level route and flow down through child
# routes. Each route can match on alert labels and send to a
# specific receiver. First match wins (depth-first search).
#
# Think of it as a decision tree:
# Is it critical? -> PagerDuty
# Is it a warning? -> Slack
# Is it info? -> Email digest
# Catch-all -> Slack (default)
# ---------------------------------------------------------------
route:
# Default receiver if no child route matches. Every alert that
# doesn't hit a more specific route ends up here.
receiver: 'slack-warnings'
# Group alerts by these labels. All alerts with the same values
# for alertname and cluster get bundled into a single notification.
# Without grouping, 50 instances of "NodeHighCPU" send 50 separate
# notifications. With grouping, you get one message listing all 50.
group_by: ['alertname', 'cluster', 'job']
# How long to wait for additional alerts in the same group before
# sending the first notification. This batches alerts that fire
# within the same time window (e.g., a cascading failure).
# 30s means "wait 30 seconds to collect related alerts before notifying."
group_wait: 30s
# Minimum time between notifications for the same alert group.
# After the first notification, wait at least 5 minutes before
# sending another update (even if new alerts join the group).
group_interval: 5m
# How long to wait before resending a notification for an alert
# that's still firing. Prevents notification fatigue from alerts
# that persist for hours/days.
# 4 hours: you get reminded about ongoing issues without being
# spammed every 5 minutes.
repeat_interval: 4h
# ---------------------------------------------------------------
# Child routes — ordered by severity
# ---------------------------------------------------------------
routes:
# Critical alerts go to PagerDuty for immediate paging.
# These are "wake someone up at 3 AM" severity.
- match:
severity: critical
receiver: 'pagerduty-critical'
# Critical alerts group faster (15s) because you need to know
# about cascading failures immediately, not 30 seconds later.
group_wait: 15s
group_interval: 5m
# Repeat every 1 hour for unresolved critical alerts.
# If it's still critical after an hour, you need another reminder.
repeat_interval: 1h
# Warning alerts go to a dedicated Slack channel.
# These are "look at this during business hours" severity.
- match:
severity: warning
receiver: 'slack-warnings'
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
# Info-level alerts go to email. These are "nice to know" events
# like backup completion, certificate renewals, or capacity reports.
# Grouped by alertname only (not cluster) to reduce email volume.
- match:
severity: info
receiver: 'email-info'
group_by: ['alertname']
group_wait: 1m
group_interval: 15m
# Only repeat daily — info alerts should not clutter your inbox.
repeat_interval: 24h
# Watchdog alert — Prometheus sends this continuously as a
# "dead man's switch." If you STOP receiving this alert,
# it means Prometheus or Alertmanager is down.
# It is routed to 'null' here to keep it out of normal channels;
# to actually benefit from it, point this route at a heartbeat
# receiver (e.g. a healthchecks.io webhook) that notifies you
# when the pings stop arriving.
- match:
alertname: Watchdog
receiver: 'null'
repeat_interval: 5m
# ---------------------------------------------------------------
# Receivers — where notifications actually go
# ---------------------------------------------------------------
receivers:
# Null receiver — swallows alerts silently. Used for Watchdog
# and any alerts you want to route but not notify on.
- name: 'null'
# PagerDuty for critical alerts (pages on-call)
- name: 'pagerduty-critical'
pagerduty_configs:
- service_key: 'PAGERDUTY_SERVICE_KEY_HERE'
severity: critical
# Include the instance and job labels in the PagerDuty
# incident for faster triage. The on-call engineer needs
# to know WHICH server and WHICH service without clicking
# through to Prometheus.
description: '{{ .CommonAnnotations.summary }}'
details:
firing: '{{ .Alerts.Firing | len }}'
resolved: '{{ .Alerts.Resolved | len }}'
cluster: '{{ .CommonLabels.cluster }}'
# Slack channel for warnings
- name: 'slack-warnings'
slack_configs:
- channel: '#homelab-alerts'
send_resolved: true
title: '{{ .CommonAnnotations.summary }}'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Labels.alertname }}
*Instance:* {{ .Labels.instance }}
*Severity:* {{ .Labels.severity }}
*Description:* {{ .Annotations.description }}
{{ end }}
# Color the Slack attachment by severity
color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'
# Email digest for info-level alerts
- name: 'email-info'
email_configs:
- to: '[email protected]'
send_resolved: true
headers:
Subject: '[HomeLab] {{ .CommonLabels.alertname }} - {{ .Status }}'
# Plain text body with all alert details
text: >-
{{ range .Alerts }}
Alert: {{ .Labels.alertname }}
Instance: {{ .Labels.instance }}
Description: {{ .Annotations.description }}
Started: {{ .StartsAt }}
{{ end }}
# ---------------------------------------------------------------
# Inhibit rules — suppress redundant alerts
# When a critical alert is firing, suppress the corresponding
# warning alert for the same alertname and instance. No point
# getting both "disk almost full (warning)" and "disk full (critical)"
# at the same time — the critical one covers it.
# ---------------------------------------------------------------
inhibit_rules:
# If critical is firing, suppress warning for the same alert+instance
- source_matchers:
- severity = "critical"
target_matchers:
- severity = "warning"
# Only suppress if these labels match between source and target.
# Without equal, a critical CPU alert on server A would suppress
# a warning disk alert on server B — not what you want.
equal: ['alertname', 'instance']
# If a node is down (NodeDown critical), suppress all other
# alerts from that node. A dead server generates dozens of
# derivative alerts (high latency, failed scrape, etc.) that
# are all symptoms of the same root cause.
- source_matchers:
- alertname = "NodeDown"
- severity = "critical"
target_matchers:
- severity =~ "warning|info"
equal: ['instance']

Grafana Provisioning
Grafana auto-provisioning for datasources (Prometheus, Loki, InfluxDB), dashboard file providers, notification channels, and key grafana.ini settings for security, SMTP, and embedding.
# Grafana Provisioning Configuration
# Grafana can auto-configure datasources, dashboards, and notification
# channels from YAML files on disk. This means you can version-control
# your entire Grafana setup and deploy it reproducibly without clicking
# through the UI.
#
# Provisioning files live in /etc/grafana/provisioning/ with this structure:
# /etc/grafana/provisioning/
# ├── datasources/
# │ └── datasources.yml (this section)
# ├── dashboards/
# │ └── dashboards.yml (this section)
# └── notifiers/
# └── notifiers.yml (this section)
#
# Dashboards themselves (JSON files) go in:
# /var/lib/grafana/dashboards/
#
# Folder organization tip: group dashboards by function:
# /var/lib/grafana/dashboards/
# ├── infrastructure/ (node exporter, Docker, networking)
# ├── applications/ (per-app dashboards)
# ├── databases/ (PostgreSQL, Redis metrics)
# └── overview/ (high-level summary boards)
# ===============================================================
# /etc/grafana/provisioning/datasources/datasources.yml
# ===============================================================
# Datasources define WHERE Grafana pulls data from.
# Each datasource maps to a backend (Prometheus, Loki, InfluxDB, etc).
# Version control these files so a fresh Grafana deploy has all
# sources pre-configured without manual UI setup.
apiVersion: 1
# deleteDatasources removes any datasource matching these names
# before applying the list below. Useful for cleaning up renamed
# or removed sources during reprovisioning.
deleteDatasources:
- name: Old-Prometheus
orgId: 1
datasources:
# -----------------------------------------------------------
# Prometheus — primary metrics source
# All infrastructure metrics (CPU, memory, disk, network,
# container stats, application metrics) flow through Prometheus.
# -----------------------------------------------------------
- name: Prometheus
type: prometheus
access: proxy
# Use the internal Docker/LAN address, not localhost.
# "proxy" mode means Grafana's backend makes the request,
# so the browser doesn't need direct access to Prometheus.
url: http://10.42.0.5:9090
# Default datasource — new panels use this unless changed.
isDefault: true
# Unique ID for referencing this source in provisioned dashboards.
# Must be consistent across deployments for dashboard JSON to work.
uid: prometheus-primary
jsonData:
# Time step for queries. Matches Prometheus scrape_interval
# so graphs align with actual data points.
timeInterval: '15s'
# HTTP method for queries. POST avoids URL length limits
# on complex PromQL queries.
httpMethod: POST
# Query timeout — fail fast if Prometheus is overloaded.
queryTimeout: '30s'
editable: false
# -----------------------------------------------------------
# Loki — log aggregation source
# Pairs with Promtail to collect and query container and
# system logs. LogQL syntax is inspired by PromQL.
# -----------------------------------------------------------
- name: Loki
type: loki
access: proxy
url: http://10.42.0.5:3100
uid: loki-primary
jsonData:
# Maximum number of log lines to return per query.
# Prevents accidentally loading millions of lines.
maxLines: 1000
# Link Loki log queries to Prometheus metric queries.
# Lets you click from a log line to the corresponding metrics.
derivedFields:
- datasourceUid: prometheus-primary
# Extract trace IDs from log lines for correlation.
matcherRegex: 'traceID=(\w+)'
name: TraceID
url: '${__value.raw}'
editable: false
# -----------------------------------------------------------
# InfluxDB — for long-term storage and IoT/sensor data
# InfluxDB handles high-cardinality time series better than
# Prometheus for certain workloads (home automation, weather
# stations, UPS metrics via NUT).
# -----------------------------------------------------------
- name: InfluxDB
type: influxdb
access: proxy
url: http://10.42.0.50:8086
uid: influxdb-primary
jsonData:
# InfluxDB 2.x uses Flux query language.
# Set to InfluxQL if using InfluxDB 1.x.
version: Flux
organization: homelab
defaultBucket: telegraf
secureJsonData:
# API token for InfluxDB 2.x authentication.
# Stored encrypted in Grafana's database.
token: INFLUXDB_TOKEN_HERE
editable: false
# ===============================================================
# /etc/grafana/provisioning/dashboards/dashboards.yml
# ===============================================================
# Dashboard providers tell Grafana where to find dashboard JSON files.
# Grafana watches these directories and auto-imports any .json files.
# Changes to JSON files on disk are picked up automatically.
#
# Community dashboards worth importing (by ID from grafana.com/dashboards):
# 1860 — Node Exporter Full (the gold standard for host monitoring)
# 14282 — cAdvisor (Docker container metrics)
# 13946 — Traefik 2 (reverse proxy metrics)
# 12633 — Loki + Promtail (log dashboard)
# 763 — Redis (cache monitoring)
#
# Download: grafana-cli plugins install or curl from grafana.com API
apiVersion: 1
providers:
- name: 'Infrastructure'
orgId: 1
# "file" provider reads JSON dashboards from the local filesystem.
# Other options: "database" for API-managed dashboards.
type: file
# disableDeletion prevents provisioned dashboards from being
# deleted through the UI. (Whether UI edits persist across restarts
# is governed by allowUiUpdates below, not by this flag.)
disableDeletion: true
# updateIntervalSeconds controls how often Grafana re-reads the
# directory for changes. 30s is fast enough for development.
updateIntervalSeconds: 30
# allowUiUpdates lets users edit provisioned dashboards in the UI
# without saving back to disk. Useful for testing changes before
# committing the updated JSON to version control.
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards/infrastructure
# foldersFromFilesStructure creates Grafana folders matching
# the subdirectory structure on disk. Keeps dashboards organized
# without manual folder assignment.
foldersFromFilesStructure: true
- name: 'Applications'
orgId: 1
type: file
disableDeletion: true
updateIntervalSeconds: 30
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards/applications
foldersFromFilesStructure: true
# ===============================================================
# /etc/grafana/provisioning/notifiers/notifiers.yml
# ===============================================================
# Notification channels for Grafana's built-in alerting.
# Note: if you use Alertmanager (recommended), configure routing
# there instead. These notifiers are for Grafana-native alerts only.
notifiers:
- name: Slack Homelab
type: slack
uid: slack-homelab
org_id: 1
is_default: true
send_reminder: true
# Remind every 4 hours for unresolved alerts.
frequency: '4h'
settings:
url: 'https://hooks.slack.com/services/T00/B00/XXXX'
recipient: '#homelab-alerts'
mentionChannel: 'here'
# Include a screenshot of the alerting panel in notifications.
uploadImage: true
- name: Email Admin
type: email
uid: email-admin
org_id: 1
is_default: false
settings:
addresses: '[email protected]'
singleEmail: true
# ===============================================================
# Key grafana.ini settings
# /etc/grafana/grafana.ini
# ===============================================================
# These are the most important non-default settings for a homelab
# Grafana instance. The full file has hundreds of options; these
# are the ones that matter for security, usability, and integration.
#
# [server]
# # Public-facing URL (used in email links, OAuth redirects)
# root_url = https://grafana.example.com
# # Serve Grafana under a subpath if behind a reverse proxy
# # serve_from_sub_path = true
#
# [security]
# # Admin password (change from default 'admin' immediately)
# admin_password = CHANGE_ME
# # Disable the admin user creation prompt on first launch
# disable_initial_admin_creation = false
# # Allow embedding Grafana panels in iframes (for dashboards on TVs)
# allow_embedding = true
# # Cookie security
# cookie_secure = true
# cookie_samesite = strict
#
# [auth.anonymous]
# # Enable anonymous access for public dashboards (wall-mounted monitors)
# enabled = true
# # Anonymous users see this org
# org_name = HomeLab
# # Viewer role = read-only, no editing
# org_role = Viewer
#
# [smtp]
# enabled = true
# host = smtp.example.com:587
# user = [email protected]
# password = SMTP_PASSWORD
# from_address = [email protected]
# from_name = Grafana HomeLab
#
# [dashboards]
# # Minimum refresh interval. Prevents users from setting 1s refresh
# # which hammers the datasource.
# # min_refresh_interval = 10s

Loki + Promtail Config
Complete Loki log aggregation server config with TSDB storage, retention, and rate limits, paired with Promtail client config for Docker container logs, systemd journal, and static file scraping with pipeline stages.
# ===============================================================
# /etc/loki/loki.yml — Loki Log Aggregation Server
# ===============================================================
# Loki is a log aggregation system designed to be cost-effective
# and easy to operate. Unlike Elasticsearch/Splunk, Loki does NOT
# index the full text of log lines. It only indexes LABELS (metadata),
# which makes it much cheaper to run but means full-text search
# requires scanning chunks at query time.
#
# CRITICAL CONCEPT — Label Cardinality:
# Loki creates one "stream" per unique label combination.
# High-cardinality labels (user_id, request_id, IP address) create
# millions of streams and will destroy performance and storage.
#
# Good labels: {job="nginx", host="altair", level="error"}
# Bad labels: {job="nginx", request_id="abc123"} <- millions of unique values
#
# Rule of thumb: if a label has more than ~100 unique values, it
# should NOT be a label. Put it in the log line and use LogQL
# pattern/regex filters to search for it at query time.
auth_enabled: false
server:
  # HTTP API port — both Promtail pushes and LogQL queries use this.
  http_listen_port: 3100
  # Binds to one specific LAN address rather than all interfaces.
  # NOTE(review): must match this host's IP — confirm on deployment.
  http_listen_address: 10.42.0.5
  grpc_listen_port: 9096
  # Log level for Loki itself. "warn" reduces noise in production.
  # Use "debug" only when troubleshooting ingestion issues.
  log_level: warn
# ---------------------------------------------------------------
# Ingester — receives log entries and builds compressed chunks
# ---------------------------------------------------------------
ingester:
  wal:
    # Write-Ahead Log directory. WAL ensures no data loss if Loki
    # crashes before flushing chunks to storage. Loki replays the
    # WAL on startup to recover in-flight data.
    dir: /var/lib/loki/wal
    enabled: true
  lifecycler:
    ring:
      kvstore:
        # "inmemory" for single-node. Use "consul" or "etcd" for
        # multi-node Loki clusters.
        store: inmemory
      # One copy of each stream — no redundancy on a single node.
      replication_factor: 1
    # How long the ingester sleeps before exiting on shutdown
    # (gives one last metrics scrape a chance to run).
    # 0s = exit immediately; fine for a single-node homelab.
    final_sleep: 0s
  # How long to keep a chunk open in memory before flushing to
  # storage. Longer = more data per chunk = better compression.
  # Shorter = lower memory usage and faster data availability.
  chunk_idle_period: 1h
  # Maximum time a chunk stays open regardless of activity.
  max_chunk_age: 2h
  # Target compressed chunk size in bytes before the chunk is cut
  # (1572864 = 1.5 MiB).
  chunk_target_size: 1572864
  # Keep flushed chunks in memory briefly so queries for very
  # recent data don't need to read them back from storage.
  chunk_retain_period: 30s
# ---------------------------------------------------------------
# Schema config — how Loki organizes data in storage
# ---------------------------------------------------------------
schema_config:
  configs:
    # Start date for this schema version. Loki supports multiple
    # schemas for migration purposes. New deployments use one entry.
    - from: "2024-01-01"
      # TSDB index store (the current recommended single-binary setup).
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        # Index tables rotate on this period. 24h creates one table
        # per day, which makes retention/deletion efficient (drop
        # entire tables instead of scanning for old entries).
        prefix: loki_index_
        period: 24h
# ---------------------------------------------------------------
# Storage — where chunks and indexes are stored
# ---------------------------------------------------------------
storage_config:
  tsdb_shipper:
    # Directory for the actively-written TSDB index files.
    active_index_directory: /var/lib/loki/tsdb-index
    # Local cache for index files fetched at query time.
    cache_location: /var/lib/loki/tsdb-cache
  filesystem:
    # Local disk storage. For production clusters, use S3/GCS/MinIO.
    # Filesystem works well for single-node homelab deployments.
    directory: /var/lib/loki/chunks
# ---------------------------------------------------------------
# Compactor — manages retention and index compaction
# ---------------------------------------------------------------
compactor:
  working_directory: /var/lib/loki/compactor
  # Compaction merges small index files into larger ones for
  # faster queries. Run frequently for active ingestion.
  compaction_interval: 10m
  # Enable retention enforcement. Without this, old data stays
  # forever regardless of the limits_config.retention_period.
  retention_enabled: true
  # Grace period between marking chunks for deletion and physically
  # removing them — a small undo window.
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
  # Backing store for delete requests; required when retention is on.
  delete_request_store: filesystem
# ---------------------------------------------------------------
# Limits — ingestion and query safety rails
# ---------------------------------------------------------------
limits_config:
  # How long to keep log data before the compactor deletes it.
  # 30 days (720h) is reasonable for a homelab. Adjust for disk space.
  # 1GB of compressed logs per day * 30 days = ~30GB storage.
  retention_period: 720h
  # Maximum rate of log ingestion per tenant, in MB/second.
  # Prevents a single noisy container from overwhelming Loki.
  ingestion_rate_mb: 16
  ingestion_burst_size_mb: 32
  # Maximum number of active streams per tenant. Each unique label
  # combination creates a stream. If you hit this limit, revisit
  # your labeling strategy — you probably have high-cardinality labels.
  max_streams_per_user: 10000
  # Maximum entries returned per query. Prevents accidental
  # "give me all logs ever" queries from killing the server.
  max_entries_limit_per_query: 5000
  # Maximum time RANGE a single query may cover (not wall-clock
  # runtime). 721h = 30 days + 1h, so a full-retention query fits.
  max_query_length: 721h
  # Reject old samples. Logs older than 1 week are probably
  # backfill from a misconfigured client, not real-time data.
  reject_old_samples: true
  reject_old_samples_max_age: 168h
# ---------------------------------------------------------------
# Query settings
# ---------------------------------------------------------------
query_range:
  results_cache:
    cache:
      embedded_cache:
        # In-process results cache — avoids running a separate
        # memcached/redis just for query caching.
        enabled: true
        max_size_mb: 100
# ===============================================================
# /etc/promtail/promtail.yml — Promtail Log Collection Agent
# ===============================================================
# Promtail runs on each host and ships logs to Loki.
# It tails log files, Docker container logs (via the Docker socket),
# and systemd journal entries.
#
# Promtail does NOT parse or index logs — it adds labels and
# sends raw lines to Loki. Pipeline stages can extract labels,
# parse timestamps, and filter lines before shipping.
# --- Promtail config begins below ---
# (Deploy as a separate file: /etc/promtail/promtail.yml)
# server:
# http_listen_port: 9080
# # Bind to all interfaces: Promtail runs on every host, so avoid
# # hard-coding one machine's IP here.
# http_listen_address: 0.0.0.0
# grpc_listen_port: 0
#
# # Where Promtail stores its read position for each log file.
# # If Promtail restarts, it resumes from where it left off
# # instead of re-sending or skipping logs.
# positions:
# filename: /var/lib/promtail/positions.yml
#
# # Loki server to ship logs to
# clients:
# - url: http://10.42.0.5:3100/loki/api/v1/push
# # Batch settings: collect logs for up to 1s or 1MB before
# # sending a batch to Loki. Reduces HTTP overhead.
# batchwait: 1s
# batchsize: 1048576
# # Retry on failure with backoff
# backoff_config:
# min_period: 500ms
# max_period: 5m
# max_retries: 10
# # Tenant ID for multi-tenant Loki (leave empty for single-tenant)
# tenant_id: ""
#
# scrape_configs:
# # ---------------------------------------------------------
# # Docker container logs via the Docker socket
# # This is the recommended way to collect container logs.
# # Promtail connects to the Docker API and tails stdout/stderr
# # from each running container. Labels are auto-extracted from
# # container metadata (name, image, compose project, etc).
# # ---------------------------------------------------------
# - job_name: docker
# docker_sd_configs:
# - host: "unix:///var/run/docker.sock"
# refresh_interval: 5s
# # Only collect logs from containers with this label.
# # Prevents noisy utility containers from flooding Loki.
# filters:
# - name: label
# values: ["logging=promtail"]
# relabel_configs:
# # Use the container name as the "container" label in Loki.
# - source_labels: ['__meta_docker_container_name']
# regex: '/(.*)'
# target_label: 'container'
# # Use the compose service name if available
# - source_labels: ['__meta_docker_container_label_com_docker_compose_service']
# target_label: 'service'
# # Use the compose project name as the "project" label
# - source_labels: ['__meta_docker_container_label_com_docker_compose_project']
# target_label: 'project'
# pipeline_stages:
# # Docker JSON log format: each line is a JSON object with
# # "log", "stream", and "time" fields. The docker{} stage
# # unwraps this format automatically.
# - docker: {}
# # Extract log level from common patterns.
# # Matches: level=error, "level":"warn", [ERROR], etc.
# - regex:
# expression: '(?i)(?:level[=:]\s*"?)(?P<level>\w+)'
# - labels:
# level:
# # Parse the timestamp from the Docker JSON wrapper.
# # This ensures Loki stores the actual log timestamp,
# # not the time Promtail received the line.
# - timestamp:
# source: time
# format: RFC3339Nano
#
# # ---------------------------------------------------------
# # Systemd journal — system service logs
# # Captures logs from systemd services (sshd, cron, kernel, etc).
# # These don't go through Docker, so they need a separate scrape.
# # ---------------------------------------------------------
# - job_name: journal
# journal:
# # Read from the system journal
# path: /var/log/journal
# # Ignore journal entries older than this when Promtail has no
# # saved position. Prevents replaying the entire journal history
# # on a fresh start.
# max_age: 12h
# labels:
# job: systemd-journal
# relabel_configs:
# # Extract the systemd unit name as a label.
# # e.g., sshd.service -> unit="sshd.service"
# - source_labels: ['__journal__systemd_unit']
# target_label: 'unit'
# - source_labels: ['__journal__hostname']
# target_label: 'hostname'
# pipeline_stages:
# # Extract syslog-style priority as log level
# - regex:
# expression: '(?P<level>emerg|alert|crit|err|warning|notice|info|debug)'
# - labels:
# level:
#
# # ---------------------------------------------------------
# # Static file paths — application logs not in Docker/systemd
# # For services that write to log files directly (Nginx access
# # logs, application logs, custom scripts).
# # ---------------------------------------------------------
# - job_name: static-files
# static_configs:
# - targets:
# - localhost
# labels:
# job: nginx
# host: altair
# __path__: /var/log/nginx/*.log
# - targets:
# - localhost
# labels:
# job: traefik
# host: altair
# __path__: /var/log/traefik/*.log
# pipeline_stages:
# # Parse Nginx combined log format into structured fields.
# # Extract method, status, and request path for filtering.
# - regex:
# expression: '^(?P<remote_addr>[\w.]+) - (?P<remote_user>\S+) \[(?P<time_local>[^\]]+)\] "(?P<method>\w+) (?P<request_path>\S+) \S+" (?P<status>\d+) (?P<body_bytes_sent>\d+)'
# # Only promote low-cardinality fields to labels.
# # method and status are fine (GET/POST/PUT + 200/404/500).
# # request_path is HIGH cardinality — do NOT label it.
# - labels:
# method:
# status:
# # Use the Nginx timestamp instead of Promtail's receive time
# - timestamp:
# source: time_local
# format: '02/Jan/2006:15:04:05 -0700'