From 089b1b9146e51c9e065b3a7563ede6a3ba48b859 Mon Sep 17 00:00:00 2001
From: wachynaky <wachynaky@gmail.com>
Date: Wed, 24 Jun 2026 06:34:09 +0000
Subject: [PATCH] Add local Ollama model selection to the web composer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expose locally-installed Ollama models as selectable chat models in the
composer's model picker. The Claude Agent SDK only speaks the Anthropic
Messages API, so Ollama (OpenAI-compatible) is reached through the bundled
CLIProxyAPI proxy, registered as an openai-compatibility upstream — this
requires proxy.enabled.

Backend:
- OllamaConfig + ollama_routable gate (Ollama enabled AND proxy enabled)
- nerve/ollama.py: best-effort model discovery via Ollama GET /api/tags
- proxy/service.py: register discovered models as a proxy upstream
- GET /api/models route for the picker
- engine: thread per-session model, recreate the SDK client on a
  mid-session model switch, and suppress Anthropic-only knobs (extended
  thinking, effort, context-1m beta) for non-Claude models
- server: pass the WS per-message model through to run()
- startup warning when ollama.enabled but proxy.enabled is false

Frontend:
- api.getModels() + optional model arg on ws.sendMessage
- chatStore holds available/selected/default model (persisted to localStorage)
- ChatInput renders a model picker, shown only when more than one model
  is offered

config.example.yaml: document the proxy and ollama blocks.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 config.example.yaml                   | 19 ++++++++
 nerve/agent/engine.py                 | 44 +++++++++++++++---
 nerve/config.py                       | 53 ++++++++++++++++++++++
 nerve/gateway/routes/__init__.py      |  2 +
 nerve/gateway/routes/models.py        | 65 +++++++++++++++++++++++++++
 nerve/gateway/server.py               | 11 +++++
 nerve/ollama.py                       | 63 ++++++++++++++++++++++++++
 nerve/proxy/service.py                | 43 ++++++++++++++++++
 web/src/api/client.ts                 |  9 ++++
 web/src/api/websocket.ts              |  7 ++-
 web/src/components/Chat/ChatInput.tsx | 39 ++++++++++++++++
 web/src/stores/chatStore.ts           | 38 +++++++++++++++-
 12 files changed, 384 insertions(+), 9 deletions(-)
 create mode 100644 nerve/gateway/routes/models.py
 create mode 100644 nerve/ollama.py

diff --git a/config.example.yaml b/config.example.yaml
index df519d0..65af4ed 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -11,6 +11,25 @@ timezone: America/New_York
 #   aws_region: us-east-1      # AWS region for Bedrock
 #   # aws_profile: ""          # Optional: AWS SSO profile name
 
+# Local proxy (CLIProxyAPI) — optional. Routes Anthropic API calls through
+# Claude Code OAuth, and is the Anthropic↔OpenAI translation layer that local
+# Ollama models are reached through. Required for the ollama block below.
+# proxy:
+#   enabled: false
+#   port: 8317
+#   host: 127.0.0.1
+
+# Local Ollama — expose locally-installed Ollama models as selectable chat
+# models in the web composer's model picker. Ollama speaks an OpenAI-compatible
+# API, so this requires proxy.enabled: true (the proxy translates the
+# Anthropic requests the SDK emits). Models are auto-discovered at runtime
+# from the running Ollama server (GET /api/tags) — whatever you've pulled
+# locally shows up automatically, no need to list models here.
+# ollama:
+#   enabled: false
+#   host: 127.0.0.1
+#   port: 11434
+
 # Agent
 agent:
   model: claude-opus-4-8         # Primary model for conversations
diff --git a/nerve/agent/engine.py b/nerve/agent/engine.py
index 01b11ae..b23172b 100644
--- a/nerve/agent/engine.py
+++ b/nerve/agent/engine.py
@@ -327,6 +327,10 @@ def __init__(self, config: NerveConfig, db: Database):
         # Read by session-scoped tools (send_file) to avoid dispatching via
         # stale router context from a prior inbound channel.
         self._active_channel: dict[str, str] = {}
+        # Resolved model bound to each session's live SDK client. Used to
+        # detect mid-session model switches (the CLI fixes its model at
+        # connect time, so a change requires recreating the client).
+        self._session_models: dict[str, str] = {}
         self._router = None  # ChannelRouter — lazy-initialized via .router property
         self._mcp_servers_cache = list(config.mcp_servers)  # hot-reloadable
         self._claude_code_plugins: list[dict[str, str]] = []  # plugin dirs
@@ -973,19 +977,28 @@ def _build_options(
         else:
             system_prompt = system_prompt_str
 
-        thinking_config = self._parse_thinking_config(
-            self.config.agent.thinking,
-            model or self.config.agent.model,
+        # Local Ollama models are reached through the proxy and speak the
+        # OpenAI-translated API — Anthropic-only knobs (extended thinking,
+        # effort, the context-1m beta) don't apply and may break translation,
+        # so suppress them for non-Claude models.
+        selected_model = model or self.config.agent.model
+        is_ollama_model = (
+            self.config.ollama.enabled and "claude" not in selected_model.lower()
         )
-        effort = self._effective_effort(
-            self.config.agent.effort,
-            model or self.config.agent.model,
+
+        thinking_config = (
+            None if is_ollama_model
+            else self._parse_thinking_config(self.config.agent.thinking, selected_model)
+        )
+        effort = (
+            None if is_ollama_model
+            else self._effective_effort(self.config.agent.effort, selected_model)
         )
         # Some subscriptions reject the context-1m beta for specific models
         # (e.g. claude-sonnet-4-6) — skip the beta header for those.
         betas = (
             ["context-1m-2025-08-07"]
-            if self.config.agent.context_1m_enabled_for(model)
+            if not is_ollama_model and self.config.agent.context_1m_enabled_for(model)
             else []
         )
 
@@ -1468,7 +1481,9 @@ async def _get_or_create_client(
         lock = self.sessions.get_lock(session_id)
         async with lock:
             client = self.sessions.get_client(session_id)
+            requested_model = model or self.config.agent.model
             if client is not None:
+                bound_model = self._session_models.get(session_id)
                 # Health check: verify the underlying CLI process is still alive
                 if self._is_client_dead(client):
                     logger.warning(
@@ -1480,6 +1495,20 @@ async def _get_or_create_client(
                     unregister_handler(session_id)
                     await self._safe_disconnect(client)
                     client = None
+                elif bound_model is not None and bound_model != requested_model:
+                    # Model switched mid-session (e.g. the composer's picker
+                    # moved from the Anthropic default to a local Ollama
+                    # model). The CLI binds its model at connect time, so
+                    # tear the client down and recreate it below.
+                    logger.info(
+                        "Session %s model changed (%s → %s), recreating client",
+                        session_id, bound_model, requested_model,
+                    )
+                    self._stop_idle_watcher(session_id)
+                    self.sessions.remove_client(session_id)
+                    unregister_handler(session_id)
+                    await self._safe_disconnect(client)
+                    client = None
                 else:
                     return client
 
@@ -1568,6 +1597,7 @@ async def _get_or_create_client(
 
             # Record connected_at and the resolved model
             resolved_model = options.model
+            self._session_models[session_id] = resolved_model
             now = datetime.now(timezone.utc).isoformat()
             connected_at = session.get("connected_at") if session and sdk_resume_id else now
             await self.sessions.mark_active(
diff --git a/nerve/config.py b/nerve/config.py
index a3bd7ce..e3e3b14 100644
--- a/nerve/config.py
+++ b/nerve/config.py
@@ -717,6 +717,48 @@ def from_dict(cls, d: dict) -> ProxyConfig:
         )
 
 
+@dataclass
+class OllamaConfig:
+    """Settings for a local Ollama server whose models become chat options.
+
+    The Claude Agent SDK emits Anthropic Messages API requests, while Ollama
+    serves an OpenAI-compatible API under ``/v1``. Requests therefore go via
+    the bundled CLIProxyAPI, which translates Anthropic ↔ OpenAI and has
+    Ollama registered as an ``openai-compatibility`` upstream.
+
+    Requirement: ``proxy.enabled: true`` must be set as well, because the
+    proxy is the translation layer. With ``enabled`` true but the proxy
+    off, Ollama models are not offered (a warning is logged at startup
+    instead).
+
+    No model list is configured here: installed models are discovered at
+    runtime via Ollama's native ``GET /api/tags`` endpoint, so whatever has
+    been pulled locally shows up in the model picker.
+    """
+
+    enabled: bool = False
+    host: str = "127.0.0.1"
+    port: int = 11434
+
+    @property
+    def base_url(self) -> str:
+        """Root URL of Ollama's native API (``/api/tags`` discovery)."""
+        return "http://{}:{}".format(self.host, self.port)
+
+    @property
+    def openai_base_url(self) -> str:
+        """Root URL of the OpenAI-compatible API (the proxy upstream)."""
+        return self.base_url + "/v1"
+
+    @classmethod
+    def from_dict(cls, d: dict) -> OllamaConfig:
+        """Build the config from the ``ollama`` mapping of the config file."""
+        enabled = bool(d.get("enabled", False))
+        host = d.get("host", "127.0.0.1")
+        port = int(d.get("port", 11434))
+        return cls(enabled=enabled, host=host, port=port)
+
+
 @dataclass
 class McpEndpointConfig:
     """Nerve's own MCP server endpoint (Nerve-as-MCP-server).
@@ -1063,6 +1105,7 @@ class NerveConfig:
     notifications: NotificationsConfig = field(default_factory=NotificationsConfig)
     docker: DockerConfig = field(default_factory=DockerConfig)
     proxy: ProxyConfig = field(default_factory=ProxyConfig)
+    ollama: OllamaConfig = field(default_factory=OllamaConfig)
     houseofagents: HouseOfAgentsConfig = field(default_factory=HouseOfAgentsConfig)
     langfuse: LangfuseConfig = field(default_factory=LangfuseConfig)
     xmemory: XmemoryConfig = field(default_factory=XmemoryConfig)
@@ -1098,6 +1141,15 @@ def effective_api_key(self) -> str:
             return self.proxy.api_key
         return self.anthropic_api_key
 
+    @property
+    def ollama_routable(self) -> bool:
+        """True when the config allows Ollama models to be served.
+
+        Checks config flags only: ``ollama.enabled`` and ``proxy.enabled`` (the
+        proxy is the Anthropic↔OpenAI translation layer for Ollama).
+        """
+        return self.ollama.enabled and self.proxy.enabled
+
     def create_anthropic_client(self, timeout: float = 60.0) -> Any:
         """Create an Anthropic client based on the configured provider.
 
@@ -1180,6 +1232,7 @@ def from_dict(cls, d: dict) -> NerveConfig:
             notifications=NotificationsConfig.from_dict(d.get("notifications", {})),
             docker=DockerConfig.from_dict(d.get("docker", {})),
             proxy=ProxyConfig.from_dict(d.get("proxy", {})),
+            ollama=OllamaConfig.from_dict(d.get("ollama", {})),
             houseofagents=HouseOfAgentsConfig.from_dict(d.get("houseofagents", {})),
             langfuse=LangfuseConfig.from_dict(d.get("langfuse", {})),
             xmemory=XmemoryConfig.from_dict(d.get("xmemory", {})),
diff --git a/nerve/gateway/routes/__init__.py b/nerve/gateway/routes/__init__.py
index 6516731..fda249d 100644
--- a/nerve/gateway/routes/__init__.py
+++ b/nerve/gateway/routes/__init__.py
@@ -32,6 +32,7 @@
     files,
     external_agents,
     prompt_rewrite,
+    models,
 )
 
 __all__ = [
@@ -61,4 +62,5 @@ def register_all_routes() -> APIRouter:
     router.include_router(files.router)
     router.include_router(external_agents.router)
     router.include_router(prompt_rewrite.router)
+    router.include_router(models.router)
     return router
diff --git a/nerve/gateway/routes/models.py b/nerve/gateway/routes/models.py
new file mode 100644
index 0000000..dcc9b92
--- /dev/null
+++ b/nerve/gateway/routes/models.py
@@ -0,0 +1,65 @@
+"""Model discovery routes — which chat models the UI can offer.
+
+Exposes the configured Anthropic chat model plus any locally-installed
+Ollama models (auto-discovered from the running Ollama server). The web
+composer's model picker calls GET /api/models to populate its options.
+
+Ollama models are only listed when ``config.ollama_routable`` is true
+(Ollama enabled *and* the proxy enabled in config), which keeps the picker
+from offering local models when no translation proxy is configured.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+from fastapi import APIRouter, Depends
+
+from nerve.config import get_config
+from nerve.gateway.auth import require_auth
+from nerve.ollama import discover_models
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+@router.get("/api/models")
+async def list_models(user: dict = Depends(require_auth)):
+    """Report the chat models the composer's model picker may offer.
+
+    Response shape:
+        {
+          "default": "<anthropic model id>",
+          "models": [{"id", "provider"}...],
+          "ollama": {"enabled", "routable", "available"}
+        }
+
+    Each entry's ``provider`` is ``"anthropic"`` or ``"ollama"``; display
+    labels are left to the frontend. Ollama discovery is best-effort: when
+    it returns nothing, the list simply has no Ollama entries.
+    """
+    cfg = get_config()
+    routable = cfg.ollama_routable
+    anthropic_id = cfg.agent.model
+
+    # Discovery is a blocking stdlib urllib call — run it off the event loop.
+    discovered = (
+        await asyncio.to_thread(discover_models, cfg.ollama.base_url)
+        if routable
+        else []
+    )
+
+    entries: list[dict[str, str]] = [{"id": anthropic_id, "provider": "anthropic"}]
+    entries += [{"id": name, "provider": "ollama"} for name in discovered]
+
+    return {
+        "default": anthropic_id,
+        "models": entries,
+        "ollama": {
+            "enabled": cfg.ollama.enabled,
+            "routable": routable,
+            "available": bool(discovered),
+        },
+    }
diff --git a/nerve/gateway/server.py b/nerve/gateway/server.py
index f0ba694..b9e27f1 100644
--- a/nerve/gateway/server.py
+++ b/nerve/gateway/server.py
@@ -128,6 +128,13 @@ async def lifespan(app: FastAPI):
         except Exception as e:
             logger.error("CLIProxyAPI proxy failed to start: %s", e)
             raise
+    elif config.ollama.enabled:
+        # Ollama needs the proxy as its Anthropic↔OpenAI translation layer.
+        logger.warning(
+            "ollama.enabled is true but proxy.enabled is false — Ollama "
+            "models require the CLIProxyAPI proxy and will NOT be offered. "
+            "Set proxy.enabled: true to use local Ollama models.",
+        )
 
     # Initialize database
     db_path = Path("~/.nerve/nerve.db").expanduser()
@@ -580,6 +587,9 @@ async def ws_broadcast(session_id: str, message: dict):
                     user_text = data.get("content", "")
                     session_id = data.get("session_id", active_session)
                     file_ids = data.get("file_ids", [])
+                    # Optional per-message model override from the composer's
+                    # model picker (Anthropic default or a local Ollama model).
+                    selected_model = data.get("model") or None
 
                     if session_id != active_session:
                         # Switch sessions
@@ -603,6 +613,7 @@ async def ws_broadcast(session_id: str, message: dict):
                             user_message=user_text,
                             source="web",
                             channel="web",
+                            model=selected_model,
                             images=images or None,
                             image_refs=image_refs or None,
                         )
diff --git a/nerve/ollama.py b/nerve/ollama.py
new file mode 100644
index 0000000..ece8c7d
--- /dev/null
+++ b/nerve/ollama.py
@@ -0,0 +1,63 @@
+"""Local Ollama integration helpers.
+
+Ollama exposes an OpenAI-compatible API at ``/v1`` and a native API at
+``/api/*``. The Claude Agent SDK only speaks the Anthropic Messages API,
+so Ollama models are reached through the bundled CLIProxyAPI (registered
+as an ``openai-compatibility`` upstream). This module only handles model
+*discovery* — querying which models are installed on the local server so
+they can be offered in the model picker.
+
+Discovery is best-effort and never raises: if the Ollama server is down or
+unreachable, callers get an empty list and Ollama simply contributes no
+models to the picker.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import urllib.error
+import urllib.request
+
+logger = logging.getLogger(__name__)
+
+# Short, bounded timeout — discovery runs on the request path (model
+# picker) and on proxy-config writes; we never want it to hang the UI or
+# block startup if Ollama is installed-but-not-running.
+_DISCOVERY_TIMEOUT = 3.0
+
+
+def discover_models(base_url: str, timeout: float = _DISCOVERY_TIMEOUT) -> list[str]:
+    """Return model names installed on a local Ollama server.
+
+    Queries Ollama's native ``GET /api/tags`` endpoint. Returns a sorted,
+    de-duplicated list of model names, or an empty list (never raises) when
+    the server is unreachable, the HTTP exchange is broken, or the response
+    is malformed (invalid JSON, non-object body, non-list ``models``).
+
+    Uses the stdlib ``urllib`` so it can be called synchronously (e.g. from
+    the proxy-config writer) without pulling in an async HTTP client.
+
+    Args:
+        base_url: Native Ollama base URL, e.g. ``http://127.0.0.1:11434``.
+        timeout: Per-request timeout in seconds.
+    """
+    import http.client  # local import: HTTPException is not an OSError
+
+    url = base_url.rstrip("/") + "/api/tags"
+    try:
+        req = urllib.request.Request(url, headers={"Accept": "application/json"})
+        with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310
+            payload = json.loads(resp.read().decode("utf-8"))
+    except (OSError, ValueError, http.client.HTTPException) as e:
+        logger.warning("Ollama model discovery failed at %s: %s", url, e)
+        return []
+
+    entries = payload.get("models") if isinstance(payload, dict) else None
+    if not isinstance(entries, list):
+        return []
+    return sorted({
+        str(item["name"])
+        for item in entries
+        if isinstance(item, dict) and item.get("name")
+    })
diff --git a/nerve/proxy/service.py b/nerve/proxy/service.py
index 8d32cf4..1e57ade 100644
--- a/nerve/proxy/service.py
+++ b/nerve/proxy/service.py
@@ -151,12 +151,55 @@ def _write_proxy_config(self) -> Path:
             "request-retry": 3,
         }
 
+        # Register a local Ollama server as an OpenAI-compatible upstream so
+        # its models become selectable. CLIProxyAPI translates the Anthropic
+        # requests the SDK emits into OpenAI calls against Ollama's /v1 API.
+        ollama_provider = self._build_ollama_provider()
+        if ollama_provider is not None:
+            proxy_cfg["openai-compatibility"] = [ollama_provider]
+
         self._config_path.parent.mkdir(parents=True, exist_ok=True)
         with open(self._config_path, "w") as f:
             yaml.safe_dump(proxy_cfg, f, default_flow_style=False, sort_keys=False)
 
         return self._config_path
 
+    def _build_ollama_provider(self) -> dict[str, Any] | None:
+        """Assemble the ``openai-compatibility`` upstream entry for Ollama.
+
+        Model names come from ``discover_models`` (Ollama's ``/api/tags``)
+        and each is exposed under an alias equal to its own name. Returns
+        ``None`` when Ollama is disabled or discovery yields no models.
+        """
+        settings = self.config.ollama
+        if not settings.enabled:
+            return None
+
+        from nerve.ollama import discover_models
+
+        native_url = settings.base_url
+        discovered = discover_models(native_url)
+        if discovered:
+            logger.info(
+                "Registering Ollama upstream (%s) with %d model(s): %s",
+                settings.openai_base_url, len(discovered), ", ".join(discovered),
+            )
+            return {
+                "name": "ollama",
+                "base-url": settings.openai_base_url,
+                # CLIProxyAPI requires a key entry even though Ollama ignores it.
+                "api-key-entries": [{"api-key": "ollama"}],
+                "models": [{"name": m, "alias": m} for m in discovered],
+            }
+
+        logger.warning(
+            "Ollama enabled but no models discovered at %s — the local "
+            "server may be down or have no models pulled. Skipping the "
+            "Ollama proxy upstream.",
+            native_url,
+        )
+        return None
+
     # ------------------------------------------------------------------ #
     #  Lifecycle                                                          #
     # ------------------------------------------------------------------ #
diff --git a/web/src/api/client.ts b/web/src/api/client.ts
index 58f5138..29302ab 100644
--- a/web/src/api/client.ts
+++ b/web/src/api/client.ts
@@ -63,6 +63,15 @@ export const api = {
 
   authStatus: () => request<{ auth_required: boolean }>('/auth/status'),
 
+  // Models — chat models offered to the composer's picker (Anthropic default
+  // plus any locally-installed Ollama models, auto-discovered server-side).
+  getModels: () =>
+    request<{
+      default: string;
+      models: { id: string; provider: string }[];
+      ollama: { enabled: boolean; routable: boolean; available: boolean };
+    }>('/models'),
+
   // Sessions
   listSessions: () => request<{ sessions: any[] }>('/sessions'),
   searchSessions: (q: string) =>
diff --git a/web/src/api/websocket.ts b/web/src/api/websocket.ts
index 9765b50..f0877b6 100644
--- a/web/src/api/websocket.ts
+++ b/web/src/api/websocket.ts
@@ -126,11 +126,16 @@ export class NerveWebSocket {
     return 'dropped';
   }
 
-  sendMessage(content: string, sessionId: string, fileIds?: string[]): SendStatus {
+  sendMessage(content: string, sessionId: string, fileIds?: string[], model?: string): SendStatus {
     const msg: Record<string, unknown> = { type: 'message', content, session_id: sessionId };
     if (fileIds && fileIds.length > 0) {
       msg.file_ids = fileIds;
     }
+    // Per-message model override from the composer's picker (omitted → server
+    // uses the configured default). May be an Anthropic id or an Ollama model.
+    if (model) {
+      msg.model = model;
+    }
     return this.send(msg);
   }
 
diff --git a/web/src/components/Chat/ChatInput.tsx b/web/src/components/Chat/ChatInput.tsx
index 33733ff..73d3b71 100644
--- a/web/src/components/Chat/ChatInput.tsx
+++ b/web/src/components/Chat/ChatInput.tsx
@@ -59,6 +59,13 @@ export function ChatInput({ onSend, onStop, isStreaming, disabled }: {
   const activeSession = useChatStore(s => s.activeSession);
   const isNewChat = useChatStore(s => s.messages.length === 0);
 
+  // ── Model picker ──
+  const availableModels = useChatStore(s => s.availableModels);
+  const selectedModel = useChatStore(s => s.selectedModel);
+  const modelsDefault = useChatStore(s => s.modelsDefault);
+  const setSelectedModel = useChatStore(s => s.setSelectedModel);
+  const loadModels = useChatStore(s => s.loadModels);
+
   const [prevQuoteCount, setPrevQuoteCount] = useState(0);
 
   // ── Prompt rewrite ──
@@ -77,6 +84,10 @@ export function ChatInput({ onSend, onStop, isStreaming, disabled }: {
       .catch(() => setRewriteAvailable(false));
   }, []);
 
+  // Load selectable models once — the picker only renders when more than the
+  // default model is offered (i.e. local Ollama models are configured).
+  useEffect(() => { loadModels(); }, [loadModels]);
+
   useEffect(() => {
     localStorage.setItem(REWRITE_PREF_KEY, rewriteEnabled ? '1' : '0');
   }, [rewriteEnabled]);
@@ -437,6 +448,34 @@ export function ChatInput({ onSend, onStop, isStreaming, disabled }: {
               <Sparkles size={18} />
             </button>
           )}
+          {/* Model picker — only when more than one model is offered (local
+              Ollama models configured + available). Hidden otherwise so the
+              composer is unchanged for the default single-model setup. */}
+          {availableModels.length > 1 && (
+            <select
+              value={selectedModel ?? modelsDefault ?? ''}
+              onChange={(e) => setSelectedModel(e.target.value === modelsDefault ? null : e.target.value)}
+              disabled={disabled || isStreaming || rewriteActive}
+              title="Model for your next message"
+              className="h-10 max-w-[170px] px-2.5 bg-surface-raised border border-border rounded-xl text-[13px] text-text-secondary outline-none focus:border-accent/50 cursor-pointer shrink-0 disabled:opacity-30 truncate"
+            >
+              {availableModels.some(m => m.provider === 'anthropic') && (
+                <optgroup label="Anthropic">
+                  {availableModels.filter(m => m.provider === 'anthropic').map(m => (
+                    <option key={m.id} value={m.id}>{m.id}</option>
+                  ))}
+                </optgroup>
+              )}
+              {availableModels.some(m => m.provider === 'ollama') && (
+                <optgroup label="Ollama (local)">
+                  {availableModels.filter(m => m.provider === 'ollama').map(m => (
+                    <option key={m.id} value={m.id}>{m.id}</option>
+                  ))}
+                </optgroup>
+              )}
+            </select>
+          )}
+
           <input
             ref={fileInputRef}
             type="file"
diff --git a/web/src/stores/chatStore.ts b/web/src/stores/chatStore.ts
index 7fb6751..b398753 100644
--- a/web/src/stores/chatStore.ts
+++ b/web/src/stores/chatStore.ts
@@ -130,6 +130,12 @@ interface ChatState {
   searchLoading: boolean;
   /** Bumped whenever something wants the sidebar search input focused (e.g. Cmd+K). */
   searchFocusNonce: number;
+  // Composer model picker: options from GET /api/models (Anthropic default +
+  // locally-installed Ollama models), the server's default id, and the user's
+  // current pick (null = use the server default).
+  availableModels: { id: string; provider: string }[];
+  modelsDefault: string | null;
+  selectedModel: string | null;
 
   loadSessions: () => Promise<void>;
   switchSession: (id: string) => Promise<void>;
@@ -144,6 +150,10 @@ interface ChatState {
   /** Trigger the sidebar to mount + focus the search input (used by Cmd+K). */
   requestSearchFocus: () => void;
   sendMessage: (content: string) => void;
+  /** Fetch selectable models for the composer picker (GET /api/models). */
+  loadModels: () => Promise<void>;
+  /** Set the model for the next message (null → server default). */
+  setSelectedModel: (model: string | null) => void;
   stopSession: () => void;
   handleWSMessage: (msg: WSMessage) => void;
   addQuote: (text: string, action: QuoteAction) => void;
@@ -194,6 +204,9 @@ export const useChatStore = create<ChatState>((set, get) => ({
   searchResults: null,
   searchLoading: false,
   searchFocusNonce: 0,
+  availableModels: [],
+  modelsDefault: null,
+  selectedModel: localStorage.getItem('nerve_selected_model') || null,
 
   addQuote: (text: string, action: QuoteAction) => {
     const id = `q${++_quoteId}`;
@@ -536,6 +549,29 @@ export const useChatStore = create<ChatState>((set, get) => ({
     set(s => ({ searchFocusNonce: s.searchFocusNonce + 1 }));
   },
 
+  loadModels: async () => {
+    try {
+      const { models, default: serverDefault } = await api.getModels();
+      // A persisted pick that is no longer offered (e.g. an uninstalled
+      // Ollama model) is discarded so it is never sent to the server.
+      const offered = models.map(m => m.id);
+      set(({ selectedModel: current }) => {
+        const stillValid = !!current && offered.includes(current);
+        const next = stillValid ? current : null;
+        if (next !== current) localStorage.removeItem('nerve_selected_model');
+        return { availableModels: models, modelsDefault: serverDefault, selectedModel: next };
+      });
+    } catch (err) {
+      console.error('Failed to load models:', err);
+    }
+  },
+
+  setSelectedModel: (model: string | null) => {
+    if (!model) localStorage.removeItem('nerve_selected_model');
+    else localStorage.setItem('nerve_selected_model', model);
+    set({ selectedModel: model });
+  },
+
   sendMessage: async (content: string, fileIds?: string[], imageBlocks?: Array<{ url: string; filename: string; media_type: string }>) => {
     let session = get().activeSession;
     const blocks: import('../types/chat').MessageBlock[] = [];
@@ -592,7 +628,7 @@ export const useChatStore = create<ChatState>((set, get) => ({
         return;
       }
     }
-    const status = ws.sendMessage(content, session, fileIds);
+    const status = ws.sendMessage(content, session, fileIds, get().selectedModel ?? undefined);
     if (status === 'dropped') {
       // The message could not reach the server. Revert the optimistic
       // state and surface the failure inline so the user knows to retry.