From 089b1b9146e51c9e065b3a7563ede6a3ba48b859 Mon Sep 17 00:00:00 2001 From: wachynaky Date: Wed, 24 Jun 2026 06:34:09 +0000 Subject: [PATCH] Add local Ollama model selection to the web composer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose locally-installed Ollama models as selectable chat models in the composer's model picker. The Claude Agent SDK only speaks the Anthropic Messages API, so Ollama (OpenAI-compatible) is reached through the bundled CLIProxyAPI proxy, where the Ollama server is registered as an openai-compatibility upstream — this requires proxy.enabled: true. Backend: - OllamaConfig + ollama_routable gate (Ollama enabled AND proxy enabled) - nerve/ollama.py: best-effort model discovery via Ollama GET /api/tags - proxy/service.py: register discovered models as a proxy upstream - GET /api/models route for the picker - engine: thread per-session model, recreate the SDK client on a mid-session model switch, and suppress Anthropic-only knobs (extended thinking, effort, context-1m beta) for non-Claude models - server: pass the WS per-message model through to run() - startup warning when ollama.enabled but proxy.enabled is false Frontend: - api.getModels() + optional model arg on ws.sendMessage - chatStore holds the available/selected/default models (only the selected model is persisted to localStorage) - ChatInput renders a model picker, shown only when more than one model is offered config.example.yaml: document the proxy and ollama blocks. 
Co-Authored-By: Claude Opus 4.8 --- config.example.yaml | 19 ++++++++ nerve/agent/engine.py | 44 +++++++++++++++--- nerve/config.py | 53 ++++++++++++++++++++++ nerve/gateway/routes/__init__.py | 2 + nerve/gateway/routes/models.py | 65 +++++++++++++++++++++++++++ nerve/gateway/server.py | 11 +++++ nerve/ollama.py | 63 ++++++++++++++++++++++++++ nerve/proxy/service.py | 43 ++++++++++++++++++ web/src/api/client.ts | 9 ++++ web/src/api/websocket.ts | 7 ++- web/src/components/Chat/ChatInput.tsx | 39 ++++++++++++++++ web/src/stores/chatStore.ts | 38 +++++++++++++++- 12 files changed, 384 insertions(+), 9 deletions(-) create mode 100644 nerve/gateway/routes/models.py create mode 100644 nerve/ollama.py diff --git a/config.example.yaml b/config.example.yaml index df519d0..65af4ed 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -11,6 +11,25 @@ timezone: America/New_York # aws_region: us-east-1 # AWS region for Bedrock # # aws_profile: "" # Optional: AWS SSO profile name +# Local proxy (CLIProxyAPI) — optional. Routes Anthropic API calls through +# Claude Code OAuth, and is the Anthropic↔OpenAI translation layer that local +# Ollama models are reached through. Required for the ollama block below. +# proxy: +# enabled: false +# port: 8317 +# host: 127.0.0.1 + +# Local Ollama — expose locally-installed Ollama models as selectable chat +# models in the web composer's model picker. Ollama speaks an OpenAI-compatible +# API, so this requires proxy.enabled: true (the proxy translates the +# Anthropic requests the SDK emits). Models are auto-discovered at runtime +# from the running Ollama server (GET /api/tags) — whatever you've pulled +# locally shows up automatically, no need to list models here. 
+# ollama: +# enabled: false +# host: 127.0.0.1 +# port: 11434 + # Agent agent: model: claude-opus-4-8 # Primary model for conversations diff --git a/nerve/agent/engine.py b/nerve/agent/engine.py index 01b11ae..b23172b 100644 --- a/nerve/agent/engine.py +++ b/nerve/agent/engine.py @@ -327,6 +327,10 @@ def __init__(self, config: NerveConfig, db: Database): # Read by session-scoped tools (send_file) to avoid dispatching via # stale router context from a prior inbound channel. self._active_channel: dict[str, str] = {} + # Resolved model bound to each session's live SDK client. Used to + # detect mid-session model switches (the CLI fixes its model at + # connect time, so a change requires recreating the client). + self._session_models: dict[str, str] = {} self._router = None # ChannelRouter — lazy-initialized via .router property self._mcp_servers_cache = list(config.mcp_servers) # hot-reloadable self._claude_code_plugins: list[dict[str, str]] = [] # plugin dirs @@ -973,19 +977,28 @@ def _build_options( else: system_prompt = system_prompt_str - thinking_config = self._parse_thinking_config( - self.config.agent.thinking, - model or self.config.agent.model, + # Local Ollama models are reached through the proxy and speak the + # OpenAI-translated API — Anthropic-only knobs (extended thinking, + # effort, the context-1m beta) don't apply and may break translation, + # so suppress them for non-Claude models. + selected_model = model or self.config.agent.model + is_ollama_model = ( + self.config.ollama.enabled and "claude" not in selected_model.lower() ) - effort = self._effective_effort( - self.config.agent.effort, - model or self.config.agent.model, + + thinking_config = ( + None if is_ollama_model + else self._parse_thinking_config(self.config.agent.thinking, selected_model) + ) + effort = ( + None if is_ollama_model + else self._effective_effort(self.config.agent.effort, selected_model) ) # Some subscriptions reject the context-1m beta for specific models # (e.g. 
claude-sonnet-4-6) — skip the beta header for those. betas = ( ["context-1m-2025-08-07"] - if self.config.agent.context_1m_enabled_for(model) + if not is_ollama_model and self.config.agent.context_1m_enabled_for(model) else [] ) @@ -1468,7 +1481,9 @@ async def _get_or_create_client( lock = self.sessions.get_lock(session_id) async with lock: client = self.sessions.get_client(session_id) + requested_model = model or self.config.agent.model if client is not None: + bound_model = self._session_models.get(session_id) # Health check: verify the underlying CLI process is still alive if self._is_client_dead(client): logger.warning( @@ -1480,6 +1495,20 @@ async def _get_or_create_client( unregister_handler(session_id) await self._safe_disconnect(client) client = None + elif bound_model is not None and bound_model != requested_model: + # Model switched mid-session (e.g. the composer's picker + # moved from the Anthropic default to a local Ollama + # model). The CLI binds its model at connect time, so + # tear the client down and recreate it below. + logger.info( + "Session %s model changed (%s → %s), recreating client", + session_id, bound_model, requested_model, + ) + self._stop_idle_watcher(session_id) + self.sessions.remove_client(session_id) + unregister_handler(session_id) + await self._safe_disconnect(client) + client = None else: return client @@ -1568,6 +1597,7 @@ async def _get_or_create_client( # Record connected_at and the resolved model resolved_model = options.model + self._session_models[session_id] = resolved_model now = datetime.now(timezone.utc).isoformat() connected_at = session.get("connected_at") if session and sdk_resume_id else now await self.sessions.mark_active( diff --git a/nerve/config.py b/nerve/config.py index a3bd7ce..e3e3b14 100644 --- a/nerve/config.py +++ b/nerve/config.py @@ -717,6 +717,48 @@ def from_dict(cls, d: dict) -> ProxyConfig: ) +@dataclass +class OllamaConfig: + """Local Ollama server — exposes its models as selectable chat models. 
+ + Ollama speaks an OpenAI-compatible API (``/v1``), not the Anthropic + Messages API the Claude Agent SDK uses. So Ollama models are routed + through the bundled CLIProxyAPI, which translates Anthropic ↔ OpenAI + and is registered with Ollama as an ``openai-compatibility`` upstream. + + Requirement: this only takes effect when the proxy is also enabled + (``proxy.enabled: true``) — the proxy is the translation layer. When + ``enabled`` is true but the proxy is off, Ollama models are not offered + (a warning is logged at startup). + + Models are auto-discovered at runtime from Ollama's native + ``GET /api/tags`` endpoint, so whatever you have pulled locally shows + up in the model picker with no extra config. + """ + + enabled: bool = False + host: str = "127.0.0.1" + port: int = 11434 + + @property + def base_url(self) -> str: + """Native Ollama base URL (used for ``/api/tags`` discovery).""" + return f"http://{self.host}:{self.port}" + + @property + def openai_base_url(self) -> str: + """OpenAI-compatible base URL (registered as a proxy upstream).""" + return f"http://{self.host}:{self.port}/v1" + + @classmethod + def from_dict(cls, d: dict) -> OllamaConfig: + return cls( + enabled=bool(d.get("enabled", False)), + host=d.get("host", "127.0.0.1"), + port=int(d.get("port", 11434)), + ) + + @dataclass class McpEndpointConfig: """Nerve's own MCP server endpoint (Nerve-as-MCP-server). 
@@ -1063,6 +1105,7 @@ class NerveConfig: notifications: NotificationsConfig = field(default_factory=NotificationsConfig) docker: DockerConfig = field(default_factory=DockerConfig) proxy: ProxyConfig = field(default_factory=ProxyConfig) + ollama: OllamaConfig = field(default_factory=OllamaConfig) houseofagents: HouseOfAgentsConfig = field(default_factory=HouseOfAgentsConfig) langfuse: LangfuseConfig = field(default_factory=LangfuseConfig) xmemory: XmemoryConfig = field(default_factory=XmemoryConfig) @@ -1098,6 +1141,15 @@ def effective_api_key(self) -> str: return self.proxy.api_key return self.anthropic_api_key + @property + def ollama_routable(self) -> bool: + """True when Ollama models can actually be served. + + Requires both Ollama enabled and the proxy running (the proxy is + the Anthropic↔OpenAI translation layer Ollama is reached through). + """ + return self.ollama.enabled and self.proxy.enabled + def create_anthropic_client(self, timeout: float = 60.0) -> Any: """Create an Anthropic client based on the configured provider. 
@@ -1180,6 +1232,7 @@ def from_dict(cls, d: dict) -> NerveConfig: notifications=NotificationsConfig.from_dict(d.get("notifications", {})), docker=DockerConfig.from_dict(d.get("docker", {})), proxy=ProxyConfig.from_dict(d.get("proxy", {})), + ollama=OllamaConfig.from_dict(d.get("ollama", {})), houseofagents=HouseOfAgentsConfig.from_dict(d.get("houseofagents", {})), langfuse=LangfuseConfig.from_dict(d.get("langfuse", {})), xmemory=XmemoryConfig.from_dict(d.get("xmemory", {})), diff --git a/nerve/gateway/routes/__init__.py b/nerve/gateway/routes/__init__.py index 6516731..fda249d 100644 --- a/nerve/gateway/routes/__init__.py +++ b/nerve/gateway/routes/__init__.py @@ -32,6 +32,7 @@ files, external_agents, prompt_rewrite, + models, ) __all__ = [ @@ -61,4 +62,5 @@ def register_all_routes() -> APIRouter: router.include_router(files.router) router.include_router(external_agents.router) router.include_router(prompt_rewrite.router) + router.include_router(models.router) return router diff --git a/nerve/gateway/routes/models.py b/nerve/gateway/routes/models.py new file mode 100644 index 0000000..dcc9b92 --- /dev/null +++ b/nerve/gateway/routes/models.py @@ -0,0 +1,65 @@ +"""Model discovery routes — which chat models the UI can offer. + +Exposes the configured Anthropic chat model plus any locally-installed +Ollama models (auto-discovered from the running Ollama server). The web +composer's model picker calls GET /api/models to populate its options. + +Ollama models are only listed when they are actually routable +(``config.ollama_routable`` — Ollama enabled *and* the proxy running), +so the picker never offers a model that would fail on send. 
+""" + +from __future__ import annotations + +import asyncio +import logging + +from fastapi import APIRouter, Depends + +from nerve.config import get_config +from nerve.gateway.auth import require_auth +from nerve.ollama import discover_models + +logger = logging.getLogger(__name__) + +router = APIRouter() + + +@router.get("/api/models") +async def list_models(user: dict = Depends(require_auth)): + """List selectable chat models for the composer's model picker. + + Returns: + { + "default": "", + "models": [{"id", "provider"}...], + "ollama": {"enabled", "routable", "available"} + } + + ``provider`` is ``"anthropic"`` or ``"ollama"``; the frontend formats + display labels. Discovery is best-effort — if the Ollama server is + unreachable the list simply contains no Ollama entries. + """ + config = get_config() + default_model = config.agent.model + + models: list[dict[str, str]] = [ + {"id": default_model, "provider": "anthropic"}, + ] + + ollama_available = False + if config.ollama_routable: + # Discovery does blocking I/O (stdlib urllib) — keep the event loop free. + names = await asyncio.to_thread(discover_models, config.ollama.base_url) + ollama_available = bool(names) + models.extend({"id": name, "provider": "ollama"} for name in names) + + return { + "default": default_model, + "models": models, + "ollama": { + "enabled": config.ollama.enabled, + "routable": config.ollama_routable, + "available": ollama_available, + }, + } diff --git a/nerve/gateway/server.py b/nerve/gateway/server.py index f0ba694..b9e27f1 100644 --- a/nerve/gateway/server.py +++ b/nerve/gateway/server.py @@ -128,6 +128,13 @@ async def lifespan(app: FastAPI): except Exception as e: logger.error("CLIProxyAPI proxy failed to start: %s", e) raise + elif config.ollama.enabled: + # Ollama needs the proxy as its Anthropic↔OpenAI translation layer. + logger.warning( + "ollama.enabled is true but proxy.enabled is false — Ollama " + "models require the CLIProxyAPI proxy and will NOT be offered. 
" + "Set proxy.enabled: true to use local Ollama models.", + ) # Initialize database db_path = Path("~/.nerve/nerve.db").expanduser() @@ -580,6 +587,9 @@ async def ws_broadcast(session_id: str, message: dict): user_text = data.get("content", "") session_id = data.get("session_id", active_session) file_ids = data.get("file_ids", []) + # Optional per-message model override from the composer's + # model picker (Anthropic default or a local Ollama model). + selected_model = data.get("model") or None if session_id != active_session: # Switch sessions @@ -603,6 +613,7 @@ async def ws_broadcast(session_id: str, message: dict): user_message=user_text, source="web", channel="web", + model=selected_model, images=images or None, image_refs=image_refs or None, ) diff --git a/nerve/ollama.py b/nerve/ollama.py new file mode 100644 index 0000000..ece8c7d --- /dev/null +++ b/nerve/ollama.py @@ -0,0 +1,63 @@ +"""Local Ollama integration helpers. + +Ollama exposes an OpenAI-compatible API at ``/v1`` and a native API at +``/api/*``. The Claude Agent SDK only speaks the Anthropic Messages API, +so Ollama models are reached through the bundled CLIProxyAPI (registered +as an ``openai-compatibility`` upstream). This module only handles model +*discovery* — querying which models are installed on the local server so +they can be offered in the model picker. + +Discovery is best-effort and never raises: if the Ollama server is down or +unreachable, callers get an empty list and Ollama simply contributes no +models to the picker. +""" + +from __future__ import annotations + +import json +import logging +import urllib.error +import urllib.request + +logger = logging.getLogger(__name__) + +# Short, bounded timeout — discovery runs on the request path (model +# picker) and on proxy-config writes; we never want it to hang the UI or +# block startup if Ollama is installed-but-not-running. 
+_DISCOVERY_TIMEOUT = 3.0 + + +def discover_models(base_url: str, timeout: float = _DISCOVERY_TIMEOUT) -> list[str]: + """Return model names installed on a local Ollama server. + + Queries Ollama's native ``GET /api/tags`` endpoint. Returns a sorted, + de-duplicated list of model names, or an empty list (never raises) when + the server is unreachable or the response is malformed. + + Uses the stdlib ``urllib`` so it is safe to call synchronously from a + worker thread (e.g. the proxy-config writer) without pulling in an + async HTTP client. + + Args: + base_url: Native Ollama base URL, e.g. ``http://127.0.0.1:11434``. + timeout: Per-request timeout in seconds. + """ + url = base_url.rstrip("/") + "/api/tags" + try: + req = urllib.request.Request(url, headers={"Accept": "application/json"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: # noqa: S310 + payload = json.loads(resp.read().decode("utf-8")) + except (urllib.error.URLError, OSError, ValueError, json.JSONDecodeError) as e: + logger.warning("Ollama model discovery failed at %s: %s", url, e) + return [] + + if not isinstance(payload, dict): + return [] + + names: set[str] = set() + for entry in payload.get("models") or []: + if isinstance(entry, dict): + name = entry.get("name") + if name: + names.add(str(name)) + return sorted(names) diff --git a/nerve/proxy/service.py b/nerve/proxy/service.py index 8d32cf4..1e57ade 100644 --- a/nerve/proxy/service.py +++ b/nerve/proxy/service.py @@ -151,12 +151,55 @@ def _write_proxy_config(self) -> Path: "request-retry": 3, } + # Register a local Ollama server as an OpenAI-compatible upstream so + # its models become selectable. CLIProxyAPI translates the Anthropic + # requests the SDK emits into OpenAI calls against Ollama's /v1 API. 
+ ollama_provider = self._build_ollama_provider() + if ollama_provider is not None: + proxy_cfg["openai-compatibility"] = [ollama_provider] + self._config_path.parent.mkdir(parents=True, exist_ok=True) with open(self._config_path, "w") as f: yaml.safe_dump(proxy_cfg, f, default_flow_style=False, sort_keys=False) return self._config_path + def _build_ollama_provider(self) -> dict[str, Any] | None: + """Build the CLIProxyAPI ``openai-compatibility`` entry for Ollama. + + Returns ``None`` when Ollama is disabled or no models are installed. + Models are auto-discovered from Ollama's ``/api/tags`` so the picker + reflects whatever is pulled locally. Each model is exposed under its + own name as the alias the client selects. + """ + ollama = self.config.ollama + if not ollama.enabled: + return None + + from nerve.ollama import discover_models + + models = discover_models(ollama.base_url) + if not models: + logger.warning( + "Ollama enabled but no models discovered at %s — the local " + "server may be down or have no models pulled. Skipping the " + "Ollama proxy upstream.", + ollama.base_url, + ) + return None + + logger.info( + "Registering Ollama upstream (%s) with %d model(s): %s", + ollama.openai_base_url, len(models), ", ".join(models), + ) + return { + "name": "ollama", + "base-url": ollama.openai_base_url, + # Ollama ignores the API key, but CLIProxyAPI requires an entry. 
+ "api-key-entries": [{"api-key": "ollama"}], + "models": [{"name": m, "alias": m} for m in models], + } + # ------------------------------------------------------------------ # # Lifecycle # # ------------------------------------------------------------------ # diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 58f5138..29302ab 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -63,6 +63,15 @@ export const api = { authStatus: () => request<{ auth_required: boolean }>('/auth/status'), + // Models — chat models offered to the composer's picker (Anthropic default + // plus any locally-installed Ollama models, auto-discovered server-side). + getModels: () => + request<{ + default: string; + models: { id: string; provider: string }[]; + ollama: { enabled: boolean; routable: boolean; available: boolean }; + }>('/models'), + // Sessions listSessions: () => request<{ sessions: any[] }>('/sessions'), searchSessions: (q: string) => diff --git a/web/src/api/websocket.ts b/web/src/api/websocket.ts index 9765b50..f0877b6 100644 --- a/web/src/api/websocket.ts +++ b/web/src/api/websocket.ts @@ -126,11 +126,16 @@ export class NerveWebSocket { return 'dropped'; } - sendMessage(content: string, sessionId: string, fileIds?: string[]): SendStatus { + sendMessage(content: string, sessionId: string, fileIds?: string[], model?: string): SendStatus { const msg: Record = { type: 'message', content, session_id: sessionId }; if (fileIds && fileIds.length > 0) { msg.file_ids = fileIds; } + // Per-message model override from the composer's picker (omitted → server + // uses the configured default). May be an Anthropic id or an Ollama model. 
+ if (model) { + msg.model = model; + } return this.send(msg); } diff --git a/web/src/components/Chat/ChatInput.tsx b/web/src/components/Chat/ChatInput.tsx index 33733ff..73d3b71 100644 --- a/web/src/components/Chat/ChatInput.tsx +++ b/web/src/components/Chat/ChatInput.tsx @@ -59,6 +59,13 @@ export function ChatInput({ onSend, onStop, isStreaming, disabled }: { const activeSession = useChatStore(s => s.activeSession); const isNewChat = useChatStore(s => s.messages.length === 0); + // ── Model picker ── + const availableModels = useChatStore(s => s.availableModels); + const selectedModel = useChatStore(s => s.selectedModel); + const modelsDefault = useChatStore(s => s.modelsDefault); + const setSelectedModel = useChatStore(s => s.setSelectedModel); + const loadModels = useChatStore(s => s.loadModels); + const [prevQuoteCount, setPrevQuoteCount] = useState(0); // ── Prompt rewrite ── @@ -77,6 +84,10 @@ export function ChatInput({ onSend, onStop, isStreaming, disabled }: { .catch(() => setRewriteAvailable(false)); }, []); + // Load selectable models once — the picker only renders when more than the + // default model is offered (i.e. local Ollama models are configured). + useEffect(() => { loadModels(); }, [loadModels]); + useEffect(() => { localStorage.setItem(REWRITE_PREF_KEY, rewriteEnabled ? '1' : '0'); }, [rewriteEnabled]); @@ -437,6 +448,34 @@ export function ChatInput({ onSend, onStop, isStreaming, disabled }: { )} + {/* Model picker — only when more than one model is offered (local + Ollama models configured + available). Hidden otherwise so the + composer is unchanged for the default single-model setup. */} + {availableModels.length > 1 && ( + + )} + Promise; switchSession: (id: string) => Promise; @@ -144,6 +150,10 @@ interface ChatState { /** Trigger the sidebar to mount + focus the search input (used by Cmd+K). */ requestSearchFocus: () => void; sendMessage: (content: string) => void; + /** Fetch selectable models for the composer picker (GET /api/models). 
*/ + loadModels: () => Promise; + /** Set the model for the next message (null → server default). */ + setSelectedModel: (model: string | null) => void; stopSession: () => void; handleWSMessage: (msg: WSMessage) => void; addQuote: (text: string, action: QuoteAction) => void; @@ -194,6 +204,9 @@ export const useChatStore = create((set, get) => ({ searchResults: null, searchLoading: false, searchFocusNonce: 0, + availableModels: [], + modelsDefault: null, + selectedModel: localStorage.getItem('nerve_selected_model') || null, addQuote: (text: string, action: QuoteAction) => { const id = `q${++_quoteId}`; @@ -536,6 +549,29 @@ export const useChatStore = create((set, get) => ({ set(s => ({ searchFocusNonce: s.searchFocusNonce + 1 })); }, + loadModels: async () => { + try { + const res = await api.getModels(); + set((state) => { + // Drop a stale pick (e.g. an Ollama model no longer installed) so we + // never send a model the server can't route. + const ids = new Set(res.models.map(m => m.id)); + const keep = state.selectedModel && ids.has(state.selectedModel) + ? 
state.selectedModel : null; + if (keep !== state.selectedModel) localStorage.removeItem('nerve_selected_model'); + return { availableModels: res.models, modelsDefault: res.default, selectedModel: keep }; + }); + } catch (e) { + console.error('Failed to load models:', e); + } + }, + + setSelectedModel: (model: string | null) => { + if (model) localStorage.setItem('nerve_selected_model', model); + else localStorage.removeItem('nerve_selected_model'); + set({ selectedModel: model }); + }, + sendMessage: async (content: string, fileIds?: string[], imageBlocks?: Array<{ url: string; filename: string; media_type: string }>) => { let session = get().activeSession; const blocks: import('../types/chat').MessageBlock[] = []; @@ -592,7 +628,7 @@ export const useChatStore = create((set, get) => ({ return; } } - const status = ws.sendMessage(content, session, fileIds); + const status = ws.sendMessage(content, session, fileIds, get().selectedModel ?? undefined); if (status === 'dropped') { // The message could not reach the server. Revert the optimistic // state and surface the failure inline so the user knows to retry.