diff --git a/Dockerfile b/Dockerfile index 7b7d4e4..745efc7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # HuggingMes - Hermes Agent Gateway for Hugging Face Spaces -ARG HERMES_AGENT_VERSION=latest -FROM nousresearch/hermes-agent:${HERMES_AGENT_VERSION} +ARG HERMES_AGENT_VERSION +FROM nousresearch/hermes-agent:${HERMES_AGENT_VERSION:-latest} USER root @@ -53,11 +53,18 @@ COPY --chown=hermes:hermes cloudflare-keepalive-setup.py /opt/huggingmes/cloudfl COPY --chown=hermes:hermes env-builder.html /opt/huggingmes/env-builder.html COPY --chown=hermes:hermes env-builder.js /opt/huggingmes/env-builder.js +# s6 cont-init.d hook: aliases GATEWAY_TOKEN -> API_SERVER_KEY in the gateway's +# container_environment before main-hermes starts, so the gateway's API server +# (enabled via API_SERVER_ENABLED above) has the key it requires to bind 8642. +# Must stay root-owned (runs as root, writes the root-owned env dir). +COPY cont-init.d/016-huggingmes-api-server-key /etc/cont-init.d/016-huggingmes-api-server-key + RUN chmod +x \ /opt/huggingmes/start.sh \ /opt/huggingmes/hermes-sync.py \ /opt/huggingmes/cloudflare-proxy-setup.py \ - /opt/huggingmes/cloudflare-keepalive-setup.py + /opt/huggingmes/cloudflare-keepalive-setup.py \ + /etc/cont-init.d/016-huggingmes-api-server-key # Patch kanban migration: wrap ALTER TABLE ADD COLUMN in try/except so a # persisted DB with the column already present doesn't crash the gateway. @@ -121,11 +128,24 @@ RUN find /opt/hermes -type d -exec chmod a+rwx {} + 2>/dev/null || true \ RUN echo 'export PATH="/opt/hermes/.venv/bin:/opt/data/.local/bin:$PATH"' \ > /etc/profile.d/hermes-venv.sh +# API_SERVER_* must be Docker ENV, not start.sh exports. The gateway runs as +# an independent s6-supervised service (main-hermes) that reads its environment +# from s6's container_environment — populated from PID 1's env (Docker ENV + +# runtime secrets) — NOT from start.sh's exports. 
The gateway enables its +# OpenAI-compatible API server (binds 127.0.0.1:8642) only when it sees +# API_SERVER_ENABLED=true (gateway/config.py). Without this the port stays +# unbound and the dashboard correctly reports "Gateway: Offline" even though +# telegram works (telegram doesn't depend on 8642). Proven by HERMES_HOME below: +# it reaches the gateway the same way. A runtime HF Space secret of the same +# name overrides these defaults. ENV HERMES_HOME=/opt/data \ HUGGINGMES_APP_DIR=/opt/huggingmes \ - HERMES_AGENT_VERSION=${HERMES_AGENT_VERSION} \ + HERMES_AGENT_VERSION=${HERMES_AGENT_VERSION:-latest} \ PYTHONUNBUFFERED=1 \ - PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium + PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium \ + API_SERVER_ENABLED=true \ + API_SERVER_HOST=127.0.0.1 \ + API_SERVER_PORT=8642 EXPOSE 7861 diff --git a/cont-init.d/016-huggingmes-api-server-key b/cont-init.d/016-huggingmes-api-server-key new file mode 100644 index 0000000..617de0f --- /dev/null +++ b/cont-init.d/016-huggingmes-api-server-key @@ -0,0 +1,53 @@ +#!/command/with-contenv sh +# Alias GATEWAY_TOKEN -> API_SERVER_KEY for the gateway s6 service. +# +# The hermes gateway runs as an independent s6 service (main-hermes) that reads +# its environment from /run/s6/container_environment/ — NOT from start.sh's +# exports. With API_SERVER_ENABLED=true (set via Dockerfile ENV) the gateway +# tries to start its OpenAI-compatible API server on 127.0.0.1:8642, but +# gateway/platforms/api_server.py *refuses to start without API_SERVER_KEY*, +# even on loopback ("Refusing to start: API_SERVER_KEY is required"). Without a +# key the platform disconnects and port 8642 stays unbound -> dashboard shows +# "Gateway: Offline". +# +# GATEWAY_TOKEN is HuggingMes's single user-facing secret. Aliasing it to +# API_SERVER_KEY (a) satisfies the gateway's key requirement and (b) makes /v1 +# auth match the token the health-server proxy already forwards — same value on +# both sides. 
+# +# Runs as root in cont-init.d. Numbered 016 so it runs AFTER 015-supervise-perms +# but BEFORE 02-reconcile-profiles — which auto-starts gateways whose persisted +# state was "running". The key must be in container_environment before that +# auto-start execs the gateway, or it loses the race and refuses to start. +# Deliberately cannot fail the boot: no `set -e`, every risky op guarded, +# explicit `exit 0`. +set -u + +CE=/run/s6/container_environment + +# Only act when: the env dir exists and the user hasn't already supplied an +# explicit API_SERVER_KEY (respect overrides). +if [ -d "$CE" ] && [ ! -s "$CE/API_SERVER_KEY" ]; then + if [ -s "$CE/GATEWAY_TOKEN" ]; then + # Preferred path: alias GATEWAY_TOKEN so /v1 auth matches the token the + # health-server proxy already forwards. + if cp "$CE/GATEWAY_TOKEN" "$CE/API_SERVER_KEY" 2>/dev/null; then + echo "[api-server-key] aliased GATEWAY_TOKEN -> API_SERVER_KEY for the gateway" + else + echo "[api-server-key] WARN: could not write API_SERVER_KEY; /v1 API server will stay disabled" + fi + else + # GATEWAY_TOKEN absent: generate an ephemeral key so the s6-supervised + # gateway can still start its API server. start.sh generates its own key + # too, but runs *after* cont-init, so the gateway would miss it through + # container_environment without this fallback. 
+ if python3 -c 'import secrets; print(secrets.token_urlsafe(32), end="")' \ + > "$CE/API_SERVER_KEY" 2>/dev/null; then + echo "[api-server-key] WARN: no GATEWAY_TOKEN — ephemeral API_SERVER_KEY generated for this boot" + else + echo "[api-server-key] WARN: could not generate ephemeral key; /v1 API server will stay disabled" + fi + fi +fi + +exit 0 diff --git a/docker-compose.yml b/docker-compose.yml index aae5c0a..9db4993 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,7 @@ services: HF_TOKEN: ${HF_TOKEN:-} SPACE_HOST: ${SPACE_HOST:-localhost:7861} DEV_MODE: ${DEV_MODE:-true} + HERMES_GATEWAY_NO_SUPERVISE: ${HERMES_GATEWAY_NO_SUPERVISE:-true} volumes: - huggingmes-data:/opt/data diff --git a/start.sh b/start.sh index 505d556..1e2a2e1 100755 --- a/start.sh +++ b/start.sh @@ -16,6 +16,8 @@ DASHBOARD_PORT="${DASHBOARD_PORT:-9119}" TELEGRAM_WEBHOOK_PORT="${TELEGRAM_WEBHOOK_PORT:-8765}" SYNC_INTERVAL="${SYNC_INTERVAL:-600}" BACKUP_DATASET="${BACKUP_DATASET_NAME:-huggingmes-backup}" +GATEWAY_HEALTH_INTERVAL="${GATEWAY_HEALTH_INTERVAL:-5}" # seconds between health polls +GATEWAY_HEALTH_FAILURES="${GATEWAY_HEALTH_FAILURES:-3}" # consecutive failures before restart CF_PROXY_ENV_FILE="/tmp/huggingmes-cloudflare-proxy.env" STARTUP_FILE="$HERMES_HOME/workspace/startup.sh" @@ -25,6 +27,7 @@ export API_SERVER_HOST="${API_SERVER_HOST:-127.0.0.1}" export API_SERVER_PORT="$GATEWAY_API_PORT" export GATEWAY_HEALTH_URL="${GATEWAY_HEALTH_URL:-http://127.0.0.1:${GATEWAY_API_PORT}}" export TELEGRAM_WEBHOOK_PORT +export HERMES_GATEWAY_NO_SUPERVISE="${HERMES_GATEWAY_NO_SUPERVISE:-true}" echo "" echo " ╔══════════════════════════════════════════╗" @@ -352,6 +355,17 @@ echo "Dashboard : http://127.0.0.1:${DASHBOARD_PORT}" echo "Gateway : http://127.0.0.1:${GATEWAY_API_PORT}" echo "" +# Wait for a TCP port to stop being bound (pure bash, no lsof/fuser). 
+wait_for_port_free() { + local port="$1" timeout="${2:-30}" i + for ((i=0; i<timeout; i++)); do + (echo > "/dev/tcp/127.0.0.1/$port") 2>/dev/null || return 0 + sleep 1 + done + echo "Warning: port $port still bound after ${timeout}s; proceeding anyway." >&2 + return 0 +} + # ── JupyterLab terminal (on by default when GATEWAY_TOKEN is set) ── JUPYTER_PID="" start_jupyter() { @@ -405,12 +419,25 @@ start_jupyter() { # ── Trap SIGTERM for graceful shutdown ── SYNC_LOOP_PID="" DASHBOARD_PID="" +SHUTTING_DOWN="" graceful_shutdown() { + SHUTTING_DOWN=1 echo "Shutting down HuggingMes..." if [ -n "${HF_TOKEN:-}" ]; then python3 "$APP_DIR/hermes-sync.py" sync-once || echo "Warning: shutdown sync failed." fi - kill $(jobs -p) 2>/dev/null || true + # Stop gateway via CLI so hermes sets gateway_state=stopped. + # This prevents 02-reconcile-profiles from auto-starting it on the next container boot. + timeout 5 hermes gateway stop 2>/dev/null || true + for pid in "${SYNC_LOOP_PID:-}" "${DASHBOARD_PID:-}" "${JUPYTER_PID:-}"; do + [ -n "$pid" ] && kill -TERM "$pid" 2>/dev/null || true + done + local deadline=$((SECONDS + 10)) + while [[ -n $(jobs -p 2>/dev/null) ]]; do + [ "$SECONDS" -ge "$deadline" ] && break + sleep 1 + done + kill -KILL $(jobs -p 2>/dev/null) 2>/dev/null || true exit 0 } trap graceful_shutdown SIGTERM SIGINT @@ -775,23 +802,34 @@ while true; do start_jupyter fi - echo "Launching Hermes gateway..." - (hermes gateway run 2>&1 | tee -a "$HERMES_HOME/logs/gateway.log") & - GATEWAY_PID=$! + [ -n "${SHUTTING_DOWN:-}" ] && break + # ── Launch or attach ── + # `hermes gateway run` exits immediately after handing off to s6-supervise. + # Use `hermes gateway restart` on subsequent iterations — `run` is refused when already supervised. + if ! (echo > "/dev/tcp/127.0.0.1/${GATEWAY_API_PORT}") 2>/dev/null; then + if [ "$GATEWAY_RESTART_COUNT" -eq 0 ]; then + echo "Launching Hermes gateway..."
+ wait_for_port_free "$GATEWAY_API_PORT" + hermes gateway run >> "$HERMES_HOME/logs/gateway.log" 2>&1 || true + else + echo "Restarting Hermes gateway (attempt ${GATEWAY_RESTART_COUNT})..." + hermes gateway restart >> "$HERMES_HOME/logs/gateway.log" 2>&1 || true + fi + fi + + # ── Wait for readiness ── ready=false for ((i=0; i "/dev/tcp/127.0.0.1/${GATEWAY_API_PORT}") 2>/dev/null; then - ready=true - break - fi - if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then - break + ready=true; break fi sleep 1 done if [ "$ready" != "true" ]; then + [ -n "${SHUTTING_DOWN:-}" ] && exit 0 echo "" echo "Hermes gateway failed to expose the API health port. Last 40 log lines:" echo "----------------------------------------" @@ -799,13 +837,22 @@ while true; do exit 1 fi - # Start sync loop (only once — shared across all gateway restarts) start_background_sync_once - set +e - wait "$GATEWAY_PID" - GATEWAY_EXIT_CODE=$? - set -e + # ── Monitor via health endpoint ── + # GATEWAY_PID is not useful: the registration wrapper exits immediately after s6 hand-off. + GATEWAY_FAIL_COUNT=0 + while true; do + [ -n "${SHUTTING_DOWN:-}" ] && break 2 + sleep "${GATEWAY_HEALTH_INTERVAL}" + if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_API_PORT}") 2>/dev/null; then + GATEWAY_FAIL_COUNT=0 + else + GATEWAY_FAIL_COUNT=$((GATEWAY_FAIL_COUNT + 1)) + echo "Gateway health miss ${GATEWAY_FAIL_COUNT}/${GATEWAY_HEALTH_FAILURES}..." + [ "$GATEWAY_FAIL_COUNT" -ge "$GATEWAY_HEALTH_FAILURES" ] && break + fi + done # Sync state before restart if [ -n "${HF_TOKEN:-}" ]; then @@ -815,10 +862,10 @@ while true; do GATEWAY_RESTART_COUNT=$((GATEWAY_RESTART_COUNT + 1)) if [ "$GATEWAY_MAX_RESTARTS" != "0" ] && [ "$GATEWAY_RESTART_COUNT" -ge "$GATEWAY_MAX_RESTARTS" ]; then - echo "Gateway exited (code ${GATEWAY_EXIT_CODE}); restart limit (${GATEWAY_MAX_RESTARTS}) reached." - exit "$GATEWAY_EXIT_CODE" + echo "Gateway exited; restart limit (${GATEWAY_MAX_RESTARTS}) reached." 
+ exit 1 fi - echo "Gateway exited (code ${GATEWAY_EXIT_CODE}); restarting in ${GATEWAY_RESTART_DELAY}s..." + echo "Gateway exited; restarting in ${GATEWAY_RESTART_DELAY}s..." sleep "$GATEWAY_RESTART_DELAY" done