Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,25 @@ timezone: America/New_York
# aws_region: us-east-1 # AWS region for Bedrock
# # aws_profile: "" # Optional: AWS SSO profile name

# Local proxy (CLIProxyAPI) — optional. Routes Anthropic API calls through
# Claude Code OAuth and serves as the Anthropic↔OpenAI translation layer used
# to reach local Ollama models. Required for the ollama block below.
# proxy:
# enabled: false
# port: 8317
# host: 127.0.0.1

# Local Ollama — expose locally-installed Ollama models as selectable chat
# models in the web composer's model picker. Ollama speaks an OpenAI-compatible
# API, so this requires proxy.enabled: true (the proxy translates the
# Anthropic requests the SDK emits). Models are auto-discovered at runtime
# from the running Ollama server (GET /api/tags) — whatever you've pulled
# locally shows up automatically, no need to list models here.
# ollama:
# enabled: false
# host: 127.0.0.1
# port: 11434

# Agent
agent:
model: claude-opus-4-8 # Primary model for conversations
Expand Down
44 changes: 37 additions & 7 deletions nerve/agent/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,10 @@ def __init__(self, config: NerveConfig, db: Database):
# Read by session-scoped tools (send_file) to avoid dispatching via
# stale router context from a prior inbound channel.
self._active_channel: dict[str, str] = {}
# Resolved model bound to each session's live SDK client. Used to
# detect mid-session model switches (the CLI fixes its model at
# connect time, so a change requires recreating the client).
self._session_models: dict[str, str] = {}
self._router = None # ChannelRouter — lazy-initialized via .router property
self._mcp_servers_cache = list(config.mcp_servers) # hot-reloadable
self._claude_code_plugins: list[dict[str, str]] = [] # plugin dirs
Expand Down Expand Up @@ -973,19 +977,28 @@ def _build_options(
else:
system_prompt = system_prompt_str

thinking_config = self._parse_thinking_config(
self.config.agent.thinking,
model or self.config.agent.model,
# Local Ollama models are reached through the proxy and speak the
# OpenAI-translated API — Anthropic-only knobs (extended thinking,
# effort, the context-1m beta) don't apply and may break translation,
# so suppress them for non-Claude models.
selected_model = model or self.config.agent.model
is_ollama_model = (
self.config.ollama.enabled and "claude" not in selected_model.lower()
)
effort = self._effective_effort(
self.config.agent.effort,
model or self.config.agent.model,

thinking_config = (
None if is_ollama_model
else self._parse_thinking_config(self.config.agent.thinking, selected_model)
)
effort = (
None if is_ollama_model
else self._effective_effort(self.config.agent.effort, selected_model)
)
# Some subscriptions reject the context-1m beta for specific models
# (e.g. claude-sonnet-4-6) — skip the beta header for those.
betas = (
["context-1m-2025-08-07"]
if self.config.agent.context_1m_enabled_for(model)
if not is_ollama_model and self.config.agent.context_1m_enabled_for(model)
else []
)

Expand Down Expand Up @@ -1468,7 +1481,9 @@ async def _get_or_create_client(
lock = self.sessions.get_lock(session_id)
async with lock:
client = self.sessions.get_client(session_id)
requested_model = model or self.config.agent.model
if client is not None:
bound_model = self._session_models.get(session_id)
# Health check: verify the underlying CLI process is still alive
if self._is_client_dead(client):
logger.warning(
Expand All @@ -1480,6 +1495,20 @@ async def _get_or_create_client(
unregister_handler(session_id)
await self._safe_disconnect(client)
client = None
elif bound_model is not None and bound_model != requested_model:
# Model switched mid-session (e.g. the composer's picker
# moved from the Anthropic default to a local Ollama
# model). The CLI binds its model at connect time, so
# tear the client down and recreate it below.
logger.info(
"Session %s model changed (%s → %s), recreating client",
session_id, bound_model, requested_model,
)
self._stop_idle_watcher(session_id)
self.sessions.remove_client(session_id)
unregister_handler(session_id)
await self._safe_disconnect(client)
client = None
else:
return client

Expand Down Expand Up @@ -1568,6 +1597,7 @@ async def _get_or_create_client(

# Record connected_at and the resolved model
resolved_model = options.model
self._session_models[session_id] = resolved_model
now = datetime.now(timezone.utc).isoformat()
connected_at = session.get("connected_at") if session and sdk_resume_id else now
await self.sessions.mark_active(
Expand Down
53 changes: 53 additions & 0 deletions nerve/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,48 @@ def from_dict(cls, d: dict) -> ProxyConfig:
)


@dataclass
class OllamaConfig:
    """Local Ollama server — exposes its models as selectable chat models.

    Ollama speaks an OpenAI-compatible API (``/v1``), not the Anthropic
    Messages API the Claude Agent SDK uses. So Ollama models are routed
    through the bundled CLIProxyAPI, which translates Anthropic ↔ OpenAI
    and has Ollama registered as an ``openai-compatibility`` upstream.

    Requirement: this only takes effect when the proxy is also enabled
    (``proxy.enabled: true``) — the proxy is the translation layer. When
    ``enabled`` is true but the proxy is off, Ollama models are not offered
    (a warning is logged at startup).

    Models are auto-discovered at runtime from Ollama's native
    ``GET /api/tags`` endpoint, so whatever you have pulled locally shows
    up in the model picker with no extra config.
    """

    # Whether local Ollama models should be offered at all.
    enabled: bool = False
    # Host/port of the Ollama server.
    host: str = "127.0.0.1"
    port: int = 11434

    @property
    def base_url(self) -> str:
        """Native Ollama base URL (used for ``/api/tags`` discovery)."""
        return f"http://{self.host}:{self.port}"

    @property
    def openai_base_url(self) -> str:
        """OpenAI-compatible base URL (registered as a proxy upstream)."""
        return f"{self.base_url}/v1"

    @classmethod
    def from_dict(cls, d: dict | None) -> OllamaConfig:
        """Build the config from the parsed ``ollama:`` mapping.

        Args:
            d: The mapping found under the ``ollama`` key. ``None`` is
                accepted because a YAML key with no children (``ollama:``
                with every sub-key commented out) parses to ``None``.

        Returns:
            An ``OllamaConfig``; any key that is missing or explicitly
            null falls back to the dataclass default.
        """
        d = d or {}
        host = d.get("host")
        port = d.get("port")
        return cls(
            enabled=bool(d.get("enabled", False)),
            # A present-but-empty YAML value is None, not "missing", so
            # ``dict.get`` defaults alone are not enough here.
            host="127.0.0.1" if host is None else host,
            port=11434 if port is None else int(port),
        )


@dataclass
class McpEndpointConfig:
"""Nerve's own MCP server endpoint (Nerve-as-MCP-server).
Expand Down Expand Up @@ -1063,6 +1105,7 @@ class NerveConfig:
notifications: NotificationsConfig = field(default_factory=NotificationsConfig)
docker: DockerConfig = field(default_factory=DockerConfig)
proxy: ProxyConfig = field(default_factory=ProxyConfig)
ollama: OllamaConfig = field(default_factory=OllamaConfig)
houseofagents: HouseOfAgentsConfig = field(default_factory=HouseOfAgentsConfig)
langfuse: LangfuseConfig = field(default_factory=LangfuseConfig)
xmemory: XmemoryConfig = field(default_factory=XmemoryConfig)
Expand Down Expand Up @@ -1098,6 +1141,15 @@ def effective_api_key(self) -> str:
return self.proxy.api_key
return self.anthropic_api_key

@property
def ollama_routable(self) -> bool:
    """Whether Ollama models can actually be served right now.

    Both switches must be on: Ollama itself, and the proxy — the
    Anthropic↔OpenAI translation layer Ollama is reached through.
    """
    ollama_on = self.ollama.enabled
    # Short-circuits exactly like a plain ``and``: the proxy flag is only
    # consulted once Ollama is known to be enabled.
    return ollama_on and self.proxy.enabled

def create_anthropic_client(self, timeout: float = 60.0) -> Any:
"""Create an Anthropic client based on the configured provider.

Expand Down Expand Up @@ -1180,6 +1232,7 @@ def from_dict(cls, d: dict) -> NerveConfig:
notifications=NotificationsConfig.from_dict(d.get("notifications", {})),
docker=DockerConfig.from_dict(d.get("docker", {})),
proxy=ProxyConfig.from_dict(d.get("proxy", {})),
ollama=OllamaConfig.from_dict(d.get("ollama", {})),
houseofagents=HouseOfAgentsConfig.from_dict(d.get("houseofagents", {})),
langfuse=LangfuseConfig.from_dict(d.get("langfuse", {})),
xmemory=XmemoryConfig.from_dict(d.get("xmemory", {})),
Expand Down
2 changes: 2 additions & 0 deletions nerve/gateway/routes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
files,
external_agents,
prompt_rewrite,
models,
)

__all__ = [
Expand Down Expand Up @@ -61,4 +62,5 @@ def register_all_routes() -> APIRouter:
router.include_router(files.router)
router.include_router(external_agents.router)
router.include_router(prompt_rewrite.router)
router.include_router(models.router)
return router
65 changes: 65 additions & 0 deletions nerve/gateway/routes/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Model discovery routes — which chat models the UI can offer.

Exposes the configured Anthropic chat model plus any locally-installed
Ollama models (auto-discovered from the running Ollama server). The web
composer's model picker calls GET /api/models to populate its options.

Ollama models are only listed when they are actually routable
(``config.ollama_routable`` — Ollama enabled *and* the proxy running),
so the picker never offers a model that would fail on send.
"""

from __future__ import annotations

import asyncio
import logging

from fastapi import APIRouter, Depends

from nerve.config import get_config
from nerve.gateway.auth import require_auth
from nerve.ollama import discover_models

logger = logging.getLogger(__name__)

router = APIRouter()


@router.get("/api/models")
async def list_models(user: dict = Depends(require_auth)):
    """Report which chat models the composer's model picker may offer.

    Response shape:
        {
            "default": "<anthropic model id>",
            "models": [{"id", "provider"}...],
            "ollama": {"enabled", "routable", "available"}
        }

    Each entry's ``provider`` is ``"anthropic"`` or ``"ollama"``; turning
    ids into display labels is left to the frontend. Ollama discovery is
    best-effort: when nothing is discovered the list simply carries no
    Ollama entries and ``available`` is false.
    """
    cfg = get_config()
    anthropic_id = cfg.agent.model
    routable = cfg.ollama_routable

    if routable:
        # Discovery does blocking I/O (stdlib urllib) — run it in a worker
        # thread so the event loop stays free.
        discovered = await asyncio.to_thread(discover_models, cfg.ollama.base_url)
    else:
        discovered = []

    # The configured Anthropic model always comes first, then local models.
    entries: list[dict[str, str]] = [{"id": anthropic_id, "provider": "anthropic"}]
    entries += [{"id": model_name, "provider": "ollama"} for model_name in discovered]

    ollama_status = {
        "enabled": cfg.ollama.enabled,
        "routable": routable,
        "available": bool(discovered),
    }
    return {"default": anthropic_id, "models": entries, "ollama": ollama_status}
11 changes: 11 additions & 0 deletions nerve/gateway/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.error("CLIProxyAPI proxy failed to start: %s", e)
raise
elif config.ollama.enabled:
# Ollama needs the proxy as its Anthropic↔OpenAI translation layer.
logger.warning(
"ollama.enabled is true but proxy.enabled is false — Ollama "
"models require the CLIProxyAPI proxy and will NOT be offered. "
"Set proxy.enabled: true to use local Ollama models.",
)

# Initialize database
db_path = Path("~/.nerve/nerve.db").expanduser()
Expand Down Expand Up @@ -580,6 +587,9 @@ async def ws_broadcast(session_id: str, message: dict):
user_text = data.get("content", "")
session_id = data.get("session_id", active_session)
file_ids = data.get("file_ids", [])
# Optional per-message model override from the composer's
# model picker (Anthropic default or a local Ollama model).
selected_model = data.get("model") or None

if session_id != active_session:
# Switch sessions
Expand All @@ -603,6 +613,7 @@ async def ws_broadcast(session_id: str, message: dict):
user_message=user_text,
source="web",
channel="web",
model=selected_model,
images=images or None,
image_refs=image_refs or None,
)
Expand Down
63 changes: 63 additions & 0 deletions nerve/ollama.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Local Ollama integration helpers.

Ollama exposes an OpenAI-compatible API at ``/v1`` and a native API at
``/api/*``. The Claude Agent SDK only speaks the Anthropic Messages API,
so Ollama models are reached through the bundled CLIProxyAPI (registered
as an ``openai-compatibility`` upstream). This module only handles model
*discovery* — querying which models are installed on the local server so
they can be offered in the model picker.

Discovery is best-effort and never raises: if the Ollama server is down or
unreachable, callers get an empty list and Ollama simply contributes no
models to the picker.
"""

from __future__ import annotations

import json
import logging
import urllib.error
import urllib.request

logger = logging.getLogger(__name__)

# Short, bounded timeout — discovery runs on the request path (model
# picker) and on proxy-config writes; we never want it to hang the UI or
# block startup if Ollama is installed-but-not-running.
_DISCOVERY_TIMEOUT = 3.0


def discover_models(base_url: str, timeout: float = _DISCOVERY_TIMEOUT) -> list[str]:
    """Return model names installed on a local Ollama server.

    Queries Ollama's native ``GET /api/tags`` endpoint. Returns a sorted,
    de-duplicated list of model names, or an empty list (never raises) when
    the server is unreachable or the response is malformed.

    Uses the stdlib ``urllib`` so it is safe to call synchronously from a
    worker thread (e.g. the proxy-config writer) without pulling in an
    async HTTP client.

    Args:
        base_url: Native Ollama base URL, e.g. ``http://127.0.0.1:11434``.
            A trailing slash is tolerated.
        timeout: Per-request timeout in seconds.

    Returns:
        Sorted unique model names; ``[]`` on any transport, decoding or
        payload-shape problem.
    """
    # Function-scope import: HTTPException is not an OSError, so it must be
    # named explicitly to honor the "never raises" contract.
    import http.client

    url = base_url.rstrip("/") + "/api/tags"
    try:
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310
            payload = json.loads(resp.read().decode("utf-8"))
    except (
        OSError,  # URLError, connection refused, timeouts
        ValueError,  # JSONDecodeError, UnicodeDecodeError, malformed URL
        http.client.HTTPException,  # truncated / garbled HTTP responses
    ) as e:
        logger.warning("Ollama model discovery failed at %s: %s", url, e)
        return []

    if not isinstance(payload, dict):
        return []

    entries = payload.get("models")
    # Guard the shape before iterating: a non-list value such as a number
    # would otherwise raise TypeError out of a best-effort helper.
    if not isinstance(entries, list):
        return []

    names = {
        str(entry["name"])
        for entry in entries
        if isinstance(entry, dict) and entry.get("name")
    }
    return sorted(names)
Loading
Loading