Skip to content

Instantly share code, notes, and snippets.

@knowlet
Created March 16, 2026 02:54
Show Gist options
  • Select an option

  • Save knowlet/d01ef6666233069d082eaa326fe74d54 to your computer and use it in GitHub Desktop.

Select an option

Save knowlet/d01ef6666233069d082eaa326fe74d54 to your computer and use it in GitHub Desktop.
litellm + autoheal + traefik + cloudflare
services:
  # Edge reverse proxy. Terminates TLS on :443 and routes to containers that
  # opt in through `traefik.enable=true` labels (exposedbydefault=false).
  traefik:
    image: traefik:v3.6.6
    command:
      - --providers.docker=true
      - --providers.docker.exposedbydefault=false
      - --providers.file.filename=/etc/traefik/dynamic.yml
      - --entrypoints.websecure.address=:443
      # Dedicated entrypoint for Prometheus metrics. It is not listed under
      # `ports`, so it is only reachable from inside the Compose network.
      - --entrypoints.metrics.address=:9100
      - --metrics.prometheus=true
      - --metrics.prometheus.entrypoint=metrics
      - --metrics.prometheus.addEntryPointsLabels=true
      - --metrics.prometheus.addRoutersLabels=true
      - --metrics.prometheus.addServicesLabels=true
    ports:
      - "443:443"
    volumes:
      # Read-only socket access is enough for the Docker provider.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./traefik_dynamic.yml:/etc/traefik/dynamic.yml:ro
      # NOTE(review): presumably referenced by the TLS section of
      # traefik_dynamic.yml - confirm.
      - ./certs:/certs:ro
    restart: unless-stopped

  # Restarts containers whose healthcheck reports unhealthy. The litellm
  # service opts in below with the `autoheal=true` label.
  autoheal:
    deploy:
      replicas: 1
    # NOTE(review): `latest` is unpinned; consider pinning a release tag.
    image: willfarrell/autoheal:latest
    # Works through the Docker socket only, so no network is attached.
    network_mode: none
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      # Mounted writable (no `:ro`), unlike the traefik mount above.
      - /var/run/docker.sock:/var/run/docker.sock

  # LiteLLM proxy, published through Traefik at ai.lkc-lab.com.
  litellm:
    restart: unless-stopped
    build:
      context: .
      # NOTE(review): this passes `target` as a build ARG. If a multi-stage
      # build target was intended it belongs at `build.target` - confirm.
      args:
        target: runtime
    image: ghcr.io/berriai/litellm:main-stable
    # Container-internal port only; nothing is published on the host.
    expose:
      - "4000"
    volumes:
      - ./config.yaml:/app/config.yaml
    command:
      - "--config=/app/config.yaml"
    environment:
      # NOTE(review): credentials are hard-coded here and must stay in sync
      # with POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB on the `db`
      # service. Consider moving them to .env or a secret store.
      DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
      STORE_MODEL_IN_DB: "True"  # allows adding models to proxy via UI
    env_file:
      - .env  # Load local .env file
    # Long form: wait until the `db` healthcheck passes, not merely until the
    # container has started, so the proxy does not boot against a database
    # that is still initialising.
    depends_on:
      db:
        condition: service_healthy
    healthcheck:
      # Healthy only when the readiness endpoint reports both the proxy and
      # the database as connected. The folded scalar below is joined into a
      # single shell command line.
      test:
        - CMD-SHELL
        # Alternative kept for reference (HTTP status check only):
        # - python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/readiness')"
        - >-
          python3 -c "import requests, sys, json;
          r=requests.get('http://localhost:4000/health/readiness', timeout=5);
          data=r.json();
          assert data['status']=='connected' and data.get('db')=='connected';"
          || exit 1
      interval: 30s  # Perform health check every 30 seconds
      timeout: 10s  # Health check command times out after 10 seconds
      retries: 3  # Retry up to 3 times if health check fails
      start_period: 40s  # Grace period after start before failures count
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.litellm.rule=Host(`ai.lkc-lab.com`)"
      - "traefik.http.routers.litellm.entrypoints=websecure"
      - "traefik.http.routers.litellm.tls=true"
      - "autoheal=true"

  # Postgres backing store for LiteLLM.
  db:
    image: postgres:16
    restart: always
    container_name: litellm_db
    environment:
      POSTGRES_DB: litellm
      POSTGRES_USER: llmproxy
      # NOTE(review): hard-coded secret, duplicated in litellm's DATABASE_URL.
      POSTGRES_PASSWORD: dbpassword9090
    volumes:
      # Persists Postgres data across container restarts
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
      interval: 1s
      timeout: 5s
      retries: 10

  # Metrics store with 15 days of retention.
  prometheus:
    # NOTE(review): no tag given, so this resolves to `latest`; consider
    # pinning a version.
    image: prom/prometheus
    volumes:
      - prometheus_data:/prometheus
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=15d"
    restart: always
# Named volumes referenced by the prometheus and db services.
volumes:
  # Prometheus TSDB storage (mounted at /prometheus).
  prometheus_data:
    driver: local
  # Postgres data directory. The explicit `name` sets a fixed volume name
  # instead of the default project-prefixed one.
  postgres_data:
    name: litellm_postgres_data  # Named volume for Postgres data persistence
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment