Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# HuggingMes - Hermes Agent Gateway for Hugging Face Spaces

ARG HERMES_AGENT_VERSION=latest
FROM nousresearch/hermes-agent:${HERMES_AGENT_VERSION}
ARG HERMES_AGENT_VERSION
FROM nousresearch/hermes-agent:${HERMES_AGENT_VERSION:-latest}

USER root

Expand Down Expand Up @@ -53,11 +53,18 @@ COPY --chown=hermes:hermes cloudflare-keepalive-setup.py /opt/huggingmes/cloudfl
COPY --chown=hermes:hermes env-builder.html /opt/huggingmes/env-builder.html
COPY --chown=hermes:hermes env-builder.js /opt/huggingmes/env-builder.js

# s6 cont-init.d hook: aliases GATEWAY_TOKEN -> API_SERVER_KEY in the gateway's
# container_environment before main-hermes starts, so the gateway's API server
# (enabled via API_SERVER_ENABLED above) has the key it requires to bind 8642.
# Must stay root-owned (runs as root, writes the root-owned env dir).
COPY cont-init.d/016-huggingmes-api-server-key /etc/cont-init.d/016-huggingmes-api-server-key

RUN chmod +x \
/opt/huggingmes/start.sh \
/opt/huggingmes/hermes-sync.py \
/opt/huggingmes/cloudflare-proxy-setup.py \
/opt/huggingmes/cloudflare-keepalive-setup.py
/opt/huggingmes/cloudflare-keepalive-setup.py \
/etc/cont-init.d/016-huggingmes-api-server-key

# Patch kanban migration: wrap ALTER TABLE ADD COLUMN in try/except so a
# persisted DB with the column already present doesn't crash the gateway.
Expand Down Expand Up @@ -121,11 +128,24 @@ RUN find /opt/hermes -type d -exec chmod a+rwx {} + 2>/dev/null || true \
RUN echo 'export PATH="/opt/hermes/.venv/bin:/opt/data/.local/bin:$PATH"' \
> /etc/profile.d/hermes-venv.sh

# API_SERVER_* must be Docker ENV, not start.sh exports. The gateway runs as
# an independent s6-supervised service (main-hermes) that reads its environment
# from s6's container_environment — populated from PID 1's env (Docker ENV +
# runtime secrets) — NOT from start.sh's exports. The gateway enables its
# OpenAI-compatible API server (binds 127.0.0.1:8642) only when it sees
# API_SERVER_ENABLED=true (gateway/config.py). Without this the port stays
# unbound and the dashboard correctly reports "Gateway: Offline" even though
# telegram works (telegram doesn't depend on 8642). Proven by HERMES_HOME below:
# it reaches the gateway the same way. A runtime HF Space secret of the same
# name overrides these defaults.
ENV HERMES_HOME=/opt/data \
HUGGINGMES_APP_DIR=/opt/huggingmes \
HERMES_AGENT_VERSION=${HERMES_AGENT_VERSION} \
HERMES_AGENT_VERSION=${HERMES_AGENT_VERSION:-latest} \
PYTHONUNBUFFERED=1 \
PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium
PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium \
API_SERVER_ENABLED=true \
API_SERVER_HOST=127.0.0.1 \
API_SERVER_PORT=8642

EXPOSE 7861

Expand Down
53 changes: 53 additions & 0 deletions cont-init.d/016-huggingmes-api-server-key
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/command/with-contenv sh
# Alias GATEWAY_TOKEN -> API_SERVER_KEY for the gateway s6 service.
#
# The hermes gateway runs as an independent s6 service (main-hermes) that reads
# its environment from /run/s6/container_environment/ — NOT from start.sh's
# exports. With API_SERVER_ENABLED=true (set via Dockerfile ENV) the gateway
# tries to start its OpenAI-compatible API server on 127.0.0.1:8642, but
# gateway/platforms/api_server.py *refuses to start without API_SERVER_KEY*,
# even on loopback ("Refusing to start: API_SERVER_KEY is required"). Without a
# key the platform disconnects and port 8642 stays unbound -> dashboard shows
# "Gateway: Offline".
#
# GATEWAY_TOKEN is HuggingMes's single user-facing secret. Aliasing it to
# API_SERVER_KEY (a) satisfies the gateway's key requirement and (b) makes /v1
# auth match the token the health-server proxy already forwards — same value on
# both sides.
#
# Runs as root in cont-init.d. Numbered 016 so it runs AFTER 015-supervise-perms
# but BEFORE 02-reconcile-profiles — which auto-starts gateways whose persisted
# state was "running". The key must be in container_environment before that
# auto-start execs the gateway, or it loses the race and refuses to start.
# Deliberately cannot fail the boot: no `set -e`, every risky op guarded,
# explicit `exit 0`.
set -u

CE=/run/s6/container_environment
KEY_FILE="$CE/API_SERVER_KEY"

# Remove a key file that a failed write left behind (zero-byte or truncated).
# Only called on paths where no usable key existed beforehand, so nothing the
# user supplied is ever deleted. Never fails.
discard_partial_key() {
  rm -f "$KEY_FILE" 2>/dev/null || true
}

# Only act when: the env dir exists and the user hasn't already supplied an
# explicit API_SERVER_KEY (respect overrides).
if [ -d "$CE" ] && [ ! -s "$KEY_FILE" ]; then
  if [ -s "$CE/GATEWAY_TOKEN" ]; then
    # Preferred path: alias GATEWAY_TOKEN so /v1 auth matches the token the
    # health-server proxy already forwards.
    if cp "$CE/GATEWAY_TOKEN" "$KEY_FILE" 2>/dev/null; then
      echo "[api-server-key] aliased GATEWAY_TOKEN -> API_SERVER_KEY for the gateway"
    else
      # A copy that died part-way would leave a truncated key that no longer
      # matches GATEWAY_TOKEN yet passes the `-s` check above on a later run.
      discard_partial_key
      echo "[api-server-key] WARN: could not write API_SERVER_KEY; /v1 API server will stay disabled"
    fi
  else
    # GATEWAY_TOKEN absent: generate an ephemeral key so the s6-supervised
    # gateway can still start its API server. start.sh generates its own key
    # too, but runs *after* cont-init, so the gateway would miss it through
    # container_environment without this fallback.
    #
    # The shell creates the redirect target before python3 even starts, so a
    # zero exit status alone is not proof that a key was written: also require
    # a non-empty file, and clean up the empty one on failure.
    if python3 -c 'import secrets; print(secrets.token_urlsafe(32), end="")' \
      > "$KEY_FILE" 2>/dev/null && [ -s "$KEY_FILE" ]; then
      echo "[api-server-key] WARN: no GATEWAY_TOKEN — ephemeral API_SERVER_KEY generated for this boot"
    else
      discard_partial_key
      echo "[api-server-key] WARN: could not generate ephemeral key; /v1 API server will stay disabled"
    fi
  fi
fi

exit 0
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ services:
HF_TOKEN: ${HF_TOKEN:-}
SPACE_HOST: ${SPACE_HOST:-localhost:7861}
DEV_MODE: ${DEV_MODE:-true}
HERMES_GATEWAY_NO_SUPERVISE: ${HERMES_GATEWAY_NO_SUPERVISE:-true}
volumes:
- huggingmes-data:/opt/data

Expand Down
81 changes: 64 additions & 17 deletions start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ DASHBOARD_PORT="${DASHBOARD_PORT:-9119}"
TELEGRAM_WEBHOOK_PORT="${TELEGRAM_WEBHOOK_PORT:-8765}"
SYNC_INTERVAL="${SYNC_INTERVAL:-600}"
BACKUP_DATASET="${BACKUP_DATASET_NAME:-huggingmes-backup}"
GATEWAY_HEALTH_INTERVAL="${GATEWAY_HEALTH_INTERVAL:-5}" # seconds between health polls
GATEWAY_HEALTH_FAILURES="${GATEWAY_HEALTH_FAILURES:-3}" # consecutive failures before restart
CF_PROXY_ENV_FILE="/tmp/huggingmes-cloudflare-proxy.env"
STARTUP_FILE="$HERMES_HOME/workspace/startup.sh"

Expand All @@ -25,6 +27,7 @@ export API_SERVER_HOST="${API_SERVER_HOST:-127.0.0.1}"
export API_SERVER_PORT="$GATEWAY_API_PORT"
export GATEWAY_HEALTH_URL="${GATEWAY_HEALTH_URL:-http://127.0.0.1:${GATEWAY_API_PORT}}"
export TELEGRAM_WEBHOOK_PORT
export HERMES_GATEWAY_NO_SUPERVISE="${HERMES_GATEWAY_NO_SUPERVISE:-true}"

echo ""
echo " ╔══════════════════════════════════════════╗"
Expand Down Expand Up @@ -352,6 +355,17 @@ echo "Dashboard : http://127.0.0.1:${DASHBOARD_PORT}"
echo "Gateway : http://127.0.0.1:${GATEWAY_API_PORT}"
echo ""

# Block until nothing accepts TCP connections on 127.0.0.1:<port>.
# Probes with bash's /dev/tcp pseudo-device, so no lsof/fuser is needed.
# Arguments: $1 - port number to probe
#            $2 - maximum number of one-second probes (default 30)
# Outputs:   a warning on stderr if the port is still bound at the end
# Returns:   always 0 — a port that never frees is reported, not fatal
wait_for_port_free() {
  local port="$1"
  local timeout="${2:-30}"
  local elapsed=0

  while (( elapsed < timeout )); do
    # A failed connect means the port is no longer bound.
    if ! (echo > "/dev/tcp/127.0.0.1/$port") 2>/dev/null; then
      return 0
    fi
    sleep 1
    elapsed=$((elapsed + 1))
  done

  echo "Warning: port $port still bound after ${timeout}s; proceeding anyway." >&2
  return 0
}

# ── JupyterLab terminal (on by default when GATEWAY_TOKEN is set) ──
JUPYTER_PID=""
start_jupyter() {
Expand Down Expand Up @@ -405,12 +419,25 @@ start_jupyter() {
# ── Trap SIGTERM for graceful shutdown ──
SYNC_LOOP_PID=""
DASHBOARD_PID=""
SHUTTING_DOWN=""
# Signal handler (installed for SIGTERM and SIGINT by the `trap` that follows).
# Flags the shutdown, runs one final sync when HF_TOKEN is set, stops the
# gateway through the hermes CLI, terminates the tracked background processes,
# waits up to 10 seconds for background jobs to go away, then exits 0.
graceful_shutdown() {
# Set first: the gateway launch/monitor loop tests SHUTTING_DOWN to bail out.
SHUTTING_DOWN=1
echo "Shutting down HuggingMes..."
# Final state sync is best-effort; a failure is only logged.
if [ -n "${HF_TOKEN:-}" ]; then
python3 "$APP_DIR/hermes-sync.py" sync-once || echo "Warning: shutdown sync failed."
fi
# NOTE(review): this signals every background job before the gateway stop and
# the per-PID TERM loop below — confirm that both are meant to run.
kill $(jobs -p) 2>/dev/null || true
# Stop gateway via CLI so hermes sets gateway_state=stopped.
# This prevents 02-reconcile-profiles from auto-starting it on the next container boot.
# Bounded to 5 seconds; errors are ignored so shutdown always continues.
timeout 5 hermes gateway stop 2>/dev/null || true
# Send TERM to each tracked helper; unset/empty PIDs are skipped and kill
# failures are ignored.
for pid in "${SYNC_LOOP_PID:-}" "${DASHBOARD_PID:-}" "${JUPYTER_PID:-}"; do
[ -n "$pid" ] && kill -TERM "$pid" 2>/dev/null || true
done
# Poll once per second, for at most 10 seconds, while `jobs -p` still lists
# background jobs.
local deadline=$((SECONDS + 10))
while [[ -n $(jobs -p 2>/dev/null) ]]; do
[ "$SECONDS" -ge "$deadline" ] && break
sleep 1
done
# Escalate to KILL for anything still listed (word splitting of the PID list
# is intentional).
kill -KILL $(jobs -p 2>/dev/null) 2>/dev/null || true
exit 0
}
trap graceful_shutdown SIGTERM SIGINT
Expand Down Expand Up @@ -775,37 +802,57 @@ while true; do
start_jupyter
fi

echo "Launching Hermes gateway..."
(hermes gateway run 2>&1 | tee -a "$HERMES_HOME/logs/gateway.log") &
GATEWAY_PID=$!
[ -n "${SHUTTING_DOWN:-}" ] && break

# ── Launch or attach ──
# `hermes gateway run` exits immediately after handing off to s6-supervise.
# Use `hermes gateway restart` on subsequent iterations — `run` is refused when already supervised.
if ! (echo > "/dev/tcp/127.0.0.1/${GATEWAY_API_PORT}") 2>/dev/null; then
if [ "$GATEWAY_RESTART_COUNT" -eq 0 ]; then
echo "Launching Hermes gateway..."
wait_for_port_free "$GATEWAY_API_PORT"
hermes gateway run >> "$HERMES_HOME/logs/gateway.log" 2>&1 || true
else
echo "Restarting Hermes gateway (attempt ${GATEWAY_RESTART_COUNT})..."
hermes gateway restart >> "$HERMES_HOME/logs/gateway.log" 2>&1 || true
fi
fi

# ── Wait for readiness ──
ready=false
for ((i=0; i<GATEWAY_READY_TIMEOUT; i++)); do
[ -n "${SHUTTING_DOWN:-}" ] && break
if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_API_PORT}") 2>/dev/null; then
ready=true
break
fi
if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then
break
ready=true; break
fi
sleep 1
done

if [ "$ready" != "true" ]; then
[ -n "${SHUTTING_DOWN:-}" ] && exit 0
echo ""
echo "Hermes gateway failed to expose the API health port. Last 40 log lines:"
echo "----------------------------------------"
tail -40 "$HERMES_HOME/logs/gateway.log" || true
exit 1
fi

# Start sync loop (only once — shared across all gateway restarts)
start_background_sync_once

set +e
wait "$GATEWAY_PID"
GATEWAY_EXIT_CODE=$?
set -e
# ── Monitor via health endpoint ──
# GATEWAY_PID is not useful: the registration wrapper exits immediately after s6 hand-off.
GATEWAY_FAIL_COUNT=0
while true; do
[ -n "${SHUTTING_DOWN:-}" ] && break 2
sleep "${GATEWAY_HEALTH_INTERVAL}"
if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_API_PORT}") 2>/dev/null; then
GATEWAY_FAIL_COUNT=0
else
GATEWAY_FAIL_COUNT=$((GATEWAY_FAIL_COUNT + 1))
echo "Gateway health miss ${GATEWAY_FAIL_COUNT}/${GATEWAY_HEALTH_FAILURES}..."
[ "$GATEWAY_FAIL_COUNT" -ge "$GATEWAY_HEALTH_FAILURES" ] && break
fi
done

# Sync state before restart
if [ -n "${HF_TOKEN:-}" ]; then
Expand All @@ -815,10 +862,10 @@ while true; do

GATEWAY_RESTART_COUNT=$((GATEWAY_RESTART_COUNT + 1))
if [ "$GATEWAY_MAX_RESTARTS" != "0" ] && [ "$GATEWAY_RESTART_COUNT" -ge "$GATEWAY_MAX_RESTARTS" ]; then
echo "Gateway exited (code ${GATEWAY_EXIT_CODE}); restart limit (${GATEWAY_MAX_RESTARTS}) reached."
exit "$GATEWAY_EXIT_CODE"
echo "Gateway exited; restart limit (${GATEWAY_MAX_RESTARTS}) reached."
exit 1
fi

echo "Gateway exited (code ${GATEWAY_EXIT_CODE}); restarting in ${GATEWAY_RESTART_DELAY}s..."
echo "Gateway exited; restarting in ${GATEWAY_RESTART_DELAY}s..."
sleep "$GATEWAY_RESTART_DELAY"
done