ClickHouse · pufit · Jun 24, 2026 · Jun 24, 2026
diff --git a/config.example.yaml b/config.example.yaml
@@ -11,6 +11,25 @@ timezone: America/New_York
 #   aws_region: us-east-1      # AWS region for Bedrock
 #   # aws_profile: ""          # Optional: AWS SSO profile name
 
+# Local proxy (CLIProxyAPI) — optional. Routes Anthropic API calls through
+# Claude Code OAuth and acts as the Anthropic↔OpenAI translation layer for
+# local Ollama models. Must be enabled to use the ollama block below.
+# proxy:
+#   enabled: false
+#   port: 8317
+#   host: 127.0.0.1
+
+# Local Ollama — expose locally-installed Ollama models as selectable chat
+# models in the web composer's model picker. Ollama speaks an OpenAI-compatible
+# API, so this requires proxy.enabled: true (the proxy translates the
+# Anthropic requests the SDK emits). Models are auto-discovered at runtime
+# from the running Ollama server (GET /api/tags) — whatever you've pulled
+# locally shows up automatically, no need to list models here.
+# ollama:
+#   enabled: false
+#   host: 127.0.0.1
+#   port: 11434
+
 # Agent
 agent:
   model: claude-opus-4-8         # Primary model for conversations

diff --git a/nerve/agent/engine.py b/nerve/agent/engine.py
@@ -327,6 +327,10 @@ def __init__(self, config: NerveConfig, db: Database):
         # Read by session-scoped tools (send_file) to avoid dispatching via
         # stale router context from a prior inbound channel.
         self._active_channel: dict[str, str] = {}
+        # Resolved model bound to each session's live SDK client. Used to
+        # detect mid-session model switches (the CLI fixes its model at
+        # connect time, so a change requires recreating the client).
+        self._session_models: dict[str, str] = {}
         self._router = None  # ChannelRouter — lazy-initialized via .router property
         self._mcp_servers_cache = list(config.mcp_servers)  # hot-reloadable
         self._claude_code_plugins: list[dict[str, str]] = []  # plugin dirs
@@ -973,19 +977,28 @@ def _build_options(
         else:
             system_prompt = system_prompt_str
 
-        thinking_config = self._parse_thinking_config(
-            self.config.agent.thinking,
-            model or self.config.agent.model,
+        # Local Ollama models are reached through the proxy and speak the
+        # OpenAI-translated API — Anthropic-only knobs (extended thinking,
+        # effort, the context-1m beta) don't apply and may break translation,
+        # so suppress them for non-Claude models.
+        selected_model = model or self.config.agent.model
+        is_ollama_model = (
+            self.config.ollama.enabled and "claude" not in selected_model.lower()
         )
-        effort = self._effective_effort(
-            self.config.agent.effort,
-            model or self.config.agent.model,
+
+        thinking_config = (
+            None if is_ollama_model
+            else self._parse_thinking_config(self.config.agent.thinking, selected_model)
+        )
+        effort = (
+            None if is_ollama_model
+            else self._effective_effort(self.config.agent.effort, selected_model)
         )
         # Some subscriptions reject the context-1m beta for specific models
         # (e.g. claude-sonnet-4-6) — skip the beta header for those.
         betas = (
             ["context-1m-2025-08-07"]
-            if self.config.agent.context_1m_enabled_for(model)
+            if not is_ollama_model and self.config.agent.context_1m_enabled_for(model)
             else []
         )
 
@@ -1468,7 +1481,9 @@ async def _get_or_create_client(
         lock = self.sessions.get_lock(session_id)
         async with lock:
             client = self.sessions.get_client(session_id)
+            requested_model = model or self.config.agent.model
             if client is not None:
+                bound_model = self._session_models.get(session_id)
                 # Health check: verify the underlying CLI process is still alive
                 if self._is_client_dead(client):
                     logger.warning(
@@ -1480,6 +1495,20 @@ async def _get_or_create_client(
                     unregister_handler(session_id)
                     await self._safe_disconnect(client)
                     client = None
+                elif bound_model is not None and bound_model != requested_model:
+                    # Model switched mid-session (e.g. the composer's picker
+                    # moved from the Anthropic default to a local Ollama
+                    # model). The CLI binds its model at connect time, so
+                    # tear the client down and recreate it below.
+                    logger.info(
+                        "Session %s model changed (%s → %s), recreating client",
+                        session_id, bound_model, requested_model,
+                    )
+                    self._stop_idle_watcher(session_id)
+                    self.sessions.remove_client(session_id)
+                    unregister_handler(session_id)
+                    await self._safe_disconnect(client)
+                    client = None
                 else:
                     return client
 
@@ -1568,6 +1597,7 @@ async def _get_or_create_client(
 
             # Record connected_at and the resolved model
             resolved_model = options.model
+            self._session_models[session_id] = resolved_model
             now = datetime.now(timezone.utc).isoformat()
             connected_at = session.get("connected_at") if session and sdk_resume_id else now
             await self.sessions.mark_active(

diff --git a/nerve/config.py b/nerve/config.py
@@ -717,6 +717,48 @@ def from_dict(cls, d: dict) -> ProxyConfig:
         )
 
 
+@dataclass
+class OllamaConfig:
+    """Local Ollama server — exposes its models as selectable chat models.
+
+    Ollama speaks an OpenAI-compatible API (``/v1``), not the Anthropic
+    Messages API the Claude Agent SDK uses, so its models are routed
+    through the bundled CLIProxyAPI, which translates Anthropic ↔ OpenAI
+    and has Ollama registered as an ``openai-compatibility`` upstream.
+
+    Requires ``proxy.enabled: true`` — the proxy is the translation layer.
+    With the proxy off, no Ollama models are offered (startup logs a warning).
+
+    Models are auto-discovered at runtime from Ollama's native
+    ``GET /api/tags`` endpoint, so no model list is configured here.
+    """
+
+    enabled: bool = False
+    host: str = "127.0.0.1"
+    port: int = 11434
+
+    @property
+    def base_url(self) -> str:
+        """Native Ollama base URL (used for ``/api/tags`` discovery)."""
+        # A bare IPv6 literal such as ``::1`` must be bracketed inside a URL.
+        needs_brackets = ":" in self.host and not self.host.startswith("[")
+        host = f"[{self.host}]" if needs_brackets else self.host
+        return f"http://{host}:{self.port}"
+
+    @property
+    def openai_base_url(self) -> str:
+        """OpenAI-compatible base URL (registered as a proxy upstream)."""
+        return f"{self.base_url}/v1"
+
+    @classmethod
+    def from_dict(cls, d: dict | None) -> OllamaConfig:
+        """Build from the ``ollama:`` mapping; empty or null keys use defaults."""
+        d = d or {}  # a bare ``ollama:`` key in YAML parses to None
+        host = str(d.get("host") or "127.0.0.1")
+        port = int(d.get("port") or 11434)
+        return cls(enabled=bool(d.get("enabled", False)), host=host, port=port)
+
+
 @dataclass
 class McpEndpointConfig:
     """Nerve's own MCP server endpoint (Nerve-as-MCP-server).
@@ -1063,6 +1105,7 @@ class NerveConfig:
     notifications: NotificationsConfig = field(default_factory=NotificationsConfig)
     docker: DockerConfig = field(default_factory=DockerConfig)
     proxy: ProxyConfig = field(default_factory=ProxyConfig)
+    ollama: OllamaConfig = field(default_factory=OllamaConfig)
     houseofagents: HouseOfAgentsConfig = field(default_factory=HouseOfAgentsConfig)
     langfuse: LangfuseConfig = field(default_factory=LangfuseConfig)
     xmemory: XmemoryConfig = field(default_factory=XmemoryConfig)
@@ -1098,6 +1141,15 @@ def effective_api_key(self) -> str:
             return self.proxy.api_key
         return self.anthropic_api_key
 
+    @property
+    def ollama_routable(self) -> bool:
+        """Whether Ollama models can be served: needs Ollama *and* the proxy.
+
+        The proxy is the Anthropic↔OpenAI translation layer Ollama sits behind.
+        """
+        ollama_on = self.ollama.enabled
+        return ollama_on and self.proxy.enabled
+
     def create_anthropic_client(self, timeout: float = 60.0) -> Any:
         """Create an Anthropic client based on the configured provider.
 
@@ -1180,6 +1232,7 @@ def from_dict(cls, d: dict) -> NerveConfig:
             notifications=NotificationsConfig.from_dict(d.get("notifications", {})),
             docker=DockerConfig.from_dict(d.get("docker", {})),
             proxy=ProxyConfig.from_dict(d.get("proxy", {})),
+            ollama=OllamaConfig.from_dict(d.get("ollama", {})),
             houseofagents=HouseOfAgentsConfig.from_dict(d.get("houseofagents", {})),
             langfuse=LangfuseConfig.from_dict(d.get("langfuse", {})),
             xmemory=XmemoryConfig.from_dict(d.get("xmemory", {})),

diff --git a/nerve/gateway/routes/__init__.py b/nerve/gateway/routes/__init__.py
@@ -32,6 +32,7 @@
     files,
     external_agents,
     prompt_rewrite,
+    models,
 )
 
 __all__ = [
@@ -61,4 +62,5 @@ def register_all_routes() -> APIRouter:
     router.include_router(files.router)
     router.include_router(external_agents.router)
     router.include_router(prompt_rewrite.router)
+    router.include_router(models.router)
     return router
diff --git a/nerve/gateway/routes/models.py b/nerve/gateway/routes/models.py
@@ -0,0 +1,65 @@
+"""Model discovery routes — which chat models the UI can offer.
+
+Exposes the configured Anthropic chat model plus any locally-installed
+Ollama models (auto-discovered from the running Ollama server). The web
+composer's model picker calls GET /api/models to populate its options.
+
+Ollama models are only listed when they are actually routable
+(``config.ollama_routable`` — Ollama enabled *and* the proxy running),
+so the picker never offers a model that would fail on send.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+from fastapi import APIRouter, Depends
+
+from nerve.config import get_config
+from nerve.gateway.auth import require_auth
+from nerve.ollama import discover_models
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+@router.get("/api/models")
+async def list_models(user: dict = Depends(require_auth)):
+    """Return the chat models the composer's model picker may offer.
+
+    Response shape:
+        {
+          "default": "<anthropic model id>",
+          "models": [{"id", "provider"}...],
+          "ollama": {"enabled", "routable", "available"}
+        }
+
+    The configured Anthropic model is always listed first with provider
+    ``"anthropic"``; discovered Ollama models follow with provider
+    ``"ollama"``. Display labels are the frontend's job. Discovery is
+    best-effort — an unreachable Ollama server just adds no entries.
+    """
+    cfg = get_config()
+    anthropic_id = cfg.agent.model
+
+    ollama_names: list[str] = []
+    if cfg.ollama_routable:
+        # discover_models blocks on stdlib urllib, so run it off the event loop.
+        ollama_names = await asyncio.to_thread(discover_models, cfg.ollama.base_url)
+
+    catalog: list[dict[str, str]] = [
+        {"id": anthropic_id, "provider": "anthropic"},
+    ]
+    catalog += [{"id": name, "provider": "ollama"} for name in ollama_names]
+
+    return {
+        "default": anthropic_id,
+        "models": catalog,
+        "ollama": {
+            "enabled": cfg.ollama.enabled,
+            "routable": cfg.ollama_routable,
+            "available": bool(ollama_names),
+        },
+    }
diff --git a/nerve/gateway/server.py b/nerve/gateway/server.py
@@ -128,6 +128,13 @@ async def lifespan(app: FastAPI):
         except Exception as e:
             logger.error("CLIProxyAPI proxy failed to start: %s", e)
             raise
+    elif config.ollama.enabled:
+        # Ollama needs the proxy as its Anthropic↔OpenAI translation layer.
+        logger.warning(
+            "ollama.enabled is true but proxy.enabled is false — Ollama "
+            "models require the CLIProxyAPI proxy and will NOT be offered. "
+            "Set proxy.enabled: true to use local Ollama models.",
+        )
 
     # Initialize database
     db_path = Path("~/.nerve/nerve.db").expanduser()
@@ -580,6 +587,9 @@ async def ws_broadcast(session_id: str, message: dict):
                     user_text = data.get("content", "")
                     session_id = data.get("session_id", active_session)
                     file_ids = data.get("file_ids", [])
+                    # Optional per-message model override from the composer's
+                    # model picker (Anthropic default or a local Ollama model).
+                    selected_model = data.get("model") or None
 
                     if session_id != active_session:
                         # Switch sessions
@@ -603,6 +613,7 @@ async def ws_broadcast(session_id: str, message: dict):
                             user_message=user_text,
                             source="web",
                             channel="web",
+                            model=selected_model,
                             images=images or None,
                             image_refs=image_refs or None,
                         )

diff --git a/nerve/ollama.py b/nerve/ollama.py
@@ -0,0 +1,63 @@
+"""Local Ollama integration helpers.
+
+Ollama exposes an OpenAI-compatible API at ``/v1`` and a native API at
+``/api/*``. The Claude Agent SDK only speaks the Anthropic Messages API,
+so Ollama models are reached through the bundled CLIProxyAPI (registered
+as an ``openai-compatibility`` upstream). This module only handles model
+*discovery* — querying which models are installed on the local server so
+they can be offered in the model picker.
+
+Discovery is best-effort and never raises: if the Ollama server is down or
+unreachable, callers get an empty list and Ollama simply contributes no
+models to the picker.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import urllib.error
+import urllib.request
+
+logger = logging.getLogger(__name__)
+
+# Short, bounded timeout — discovery runs on the request path (model
+# picker) and on proxy-config writes; we never want it to hang the UI or
+# block startup if Ollama is installed-but-not-running.
+_DISCOVERY_TIMEOUT = 3.0
+
+
+def discover_models(base_url: str, timeout: float = _DISCOVERY_TIMEOUT) -> list[str]:
+    """Return the names of the models installed on a local Ollama server.
+
+    Queries Ollama's native ``GET /api/tags`` endpoint with the stdlib
+    ``urllib`` (blocking — call it from a worker thread, not the event
+    loop). Best-effort: an unreachable server, a non-HTTP or truncated
+    reply, and a malformed payload all yield ``[]`` instead of raising.
+
+    Args:
+        base_url: Native Ollama base URL, e.g. ``http://127.0.0.1:11434``.
+        timeout: Per-request timeout in seconds.
+
+    Returns:
+        Sorted, de-duplicated model names; empty when discovery fails.
+    """
+    # Local import: urllib.request already loads http.client, so this is free.
+    from http.client import HTTPException
+
+    url = base_url.rstrip("/") + "/api/tags"
+    try:
+        req = urllib.request.Request(url, headers={"Accept": "application/json"})
+        with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310
+            payload = json.loads(resp.read().decode("utf-8"))
+    # OSError covers URLError/timeouts; ValueError covers bad JSON/UTF-8 and
+    # unknown URL schemes; HTTPException covers BadStatusLine/IncompleteRead.
+    except (OSError, ValueError, HTTPException) as err:
+        logger.warning("Ollama model discovery failed at %s: %s", url, err)
+        return []
+
+    entries = payload.get("models") if isinstance(payload, dict) else None
+    if not isinstance(entries, list):
+        return []  # wrong shape (e.g. a scalar) — iterating it could raise
+    found = {str(m["name"]) for m in entries if isinstance(m, dict) and m.get("name")}
+    return sorted(found)