diff --git a/.cspell/words.txt b/.cspell/words.txt index e95fc06..f9f20fe 100644 --- a/.cspell/words.txt +++ b/.cspell/words.txt @@ -61,5 +61,6 @@ unresumable urandom venv vtable +xhigh xxhash yazi diff --git a/crates/oxide-code/src/client/anthropic.rs b/crates/oxide-code/src/client/anthropic.rs index cc70362..1e20c99 100644 --- a/crates/oxide-code/src/client/anthropic.rs +++ b/crates/oxide-code/src/client/anthropic.rs @@ -24,7 +24,7 @@ use tracing::debug; use uuid::Uuid; use super::billing; -use crate::config::{Auth, Config, ThinkingConfig}; +use crate::config::{Auth, Config, Effort, PromptCacheTtl, ThinkingConfig}; use crate::message::{ContentBlock, Message, Role}; use crate::prompt::SYSTEM_PROMPT_DYNAMIC_BOUNDARY; use crate::tool::ToolDefinition; @@ -39,8 +39,10 @@ const OAUTH_BETA_HEADER: &str = "oauth-2025-04-20"; const PROMPT_CACHING_SCOPE_BETA_HEADER: &str = "prompt-caching-scope-2026-01-05"; const STRUCTURED_OUTPUTS_BETA_HEADER: &str = "structured-outputs-2025-12-15"; -/// Matches the installed Claude Code version. -const CLAUDE_CLI_VERSION: &str = "2.1.101"; +/// Matches the installed Claude Code version. The rest of this PR is +/// pinned against 2.1.119 packet captures; keep the wire +/// `User-Agent` / `cc_version` claim aligned. +const CLAUDE_CLI_VERSION: &str = "2.1.119"; /// OAuth-required identity prefix. The Anthropic API returns 429 for non-Haiku /// models with OAuth tokens unless the system prompt starts with this exact @@ -64,20 +66,66 @@ struct CreateMessageRequest<'a> { tools: Option<&'a [ToolDefinition]>, #[serde(skip_serializing_if = "Option::is_none")] thinking: Option<&'a ThinkingConfig>, - /// JSON-schema-constrained output format for one-shot utility calls - /// (title generation, future classifiers). Must travel alongside the - /// `structured-outputs-2025-12-15` beta header; both are gated on - /// `Capabilities::structured_outputs` so unsupported models silently - /// drop back to free-form text rather than 400ing the gateway. 
+ /// Carries both the `format` (JSON-schema-constrained output for + /// one-shot calls) and `effort` (agentic-path intelligence tier) + /// knobs. Wrapped in `Option` so an empty `OutputConfig` never + /// ships — callers build one via [`OutputConfig::new`] and pass + /// `None` when neither sub-field is set. #[serde(skip_serializing_if = "Option::is_none")] output_config: Option>, + /// `context_management.edits` — the client-side context-editing + /// directive that partners the `context-management-2025-06-27` + /// beta header. Populated on the streaming path for any model + /// with [`Capabilities::context_management`] set. + #[serde(skip_serializing_if = "Option::is_none")] + context_management: Option, messages: &'a [Message], } -/// Wrapper matching the wire shape `output_config.format = {...}`. +/// Shared wrapper for the `output_config` body field. Either field +/// may be absent; when both are, [`Self::new`] returns `None` so the +/// builder never ships an empty object. #[derive(Serialize)] struct OutputConfig<'a> { - format: &'a OutputFormat, + #[serde(skip_serializing_if = "Option::is_none")] + format: Option<&'a OutputFormat>, + #[serde(skip_serializing_if = "Option::is_none")] + effort: Option, +} + +impl<'a> OutputConfig<'a> { + /// Returns `None` when every field is empty so callers can avoid + /// shipping a bare `{}`. `Some(_)` otherwise. + fn new(format: Option<&'a OutputFormat>, effort: Option) -> Option { + (format.is_some() || effort.is_some()).then_some(Self { format, effort }) + } +} + +/// `context_management.edits` body field. oxide-code mirrors +/// claude-code 2.1.119's observed wire shape — a single +/// `clear_thinking_20251015` edit with `keep = "all"` on every +/// agentic request that also ships the matching beta header. +#[derive(Serialize)] +struct ContextManagement { + edits: [ContextEdit; 1], +} + +impl ContextManagement { + /// Wire shape claude-code 2.1.119 sends on every 4.6+ request. 
+ /// Single place to edit when Anthropic ships newer edit types or + /// we need to diverge from the default. + fn clear_thinking_keep_all() -> Self { + Self { + edits: [ContextEdit::ClearThinking20251015 { keep: "all" }], + } + } +} + +#[derive(Serialize)] +#[serde(tag = "type")] +enum ContextEdit { + #[serde(rename = "clear_thinking_20251015")] + ClearThinking20251015 { keep: &'static str }, } /// JSON-schema-constrained completion format. Constructed via @@ -128,6 +176,8 @@ struct SystemBlock<'a> { /// Prompt caching control. The `scope` field determines the cache sharing /// level: `"global"` for static content identical across sessions (1P only), /// `None` for the default org-scoped ephemeral cache (universally accepted). +/// The `ttl` field overrides the server default (5 m as of 2026-03) — +/// oxide-code defaults to `"1h"`, opt-out via `prompt_cache_ttl = "5m"`. /// /// `scope: "global"` must be a true prefix of all preceding request content /// — the server rejects a global-scoped block preceded by a non-global @@ -138,6 +188,8 @@ struct CacheControl { r#type: &'static str, #[serde(skip_serializing_if = "Option::is_none")] scope: Option<&'static str>, + #[serde(skip_serializing_if = "Option::is_none")] + ttl: Option<&'static str>, } // ── SSE response types ── @@ -438,7 +490,10 @@ impl Client { system_blocks.push(SystemBlock { r#type: "text", text: &static_joined, - cache_control: Some(static_prefix_cache_control(is_first_party)), + cache_control: Some(static_prefix_cache_control( + is_first_party, + self.config.prompt_cache_ttl, + )), }); } if !dynamic_joined.is_empty() { @@ -449,6 +504,8 @@ impl Client { }); } + let caps = crate::model::capabilities_for(&self.config.model); + let url = format!("{}/v1/messages?beta=true", self.config.base_url); let mut body = serde_json::to_string(&CreateMessageRequest { // `[1m]` is a client-side tag; strip before the wire. 
@@ -459,7 +516,14 @@ impl Client { system: system_blocks, tools: (!tools.is_empty()).then_some(tools), thinking: self.config.thinking.as_ref(), - output_config: None, + output_config: OutputConfig::new(None, self.config.effort), + // Gated on the same capability flag as the + // `context-management-2025-06-27` beta header so body and + // header stay in sync — claude-code 2.1.119 ships them + // together on every 4.6+ agentic request. + context_management: caps + .context_management + .then(ContextManagement::clear_thinking_keep_all), messages: effective_messages, }) .context("failed to serialize request")?; @@ -587,9 +651,7 @@ fn compute_betas( want_structured: bool, is_first_party: bool, ) -> Vec<&'static str> { - let caps = crate::model::lookup(model) - .map(|info| info.capabilities) - .unwrap_or_default(); + let caps = crate::model::capabilities_for(model); let is_haiku = model.to_lowercase().contains("haiku"); // Order mirrors `docs/research/anthropic-api.md` → Per-model beta @@ -642,7 +704,7 @@ fn compute_betas( /// Whether the target model accepts the `structured-outputs-2025-12-15` /// beta. Thin wrapper over the capability table for pre-checks. pub(crate) fn supports_structured_outputs(model: &str) -> bool { - crate::model::lookup(model).is_some_and(|info| info.capabilities.structured_outputs) + crate::model::capabilities_for(model).structured_outputs } /// Whether `base_url` points at the first-party Anthropic API, gating @@ -669,11 +731,13 @@ fn is_first_party_base_url(base_url: &str) -> bool { /// global scope so the prefix is shared across sessions; on 3P, fall /// back to the default (org-scoped) ephemeral cache — 3P gateways /// reject `scope: "global"` because tool definitions render first and -/// taint the cache prefix. -fn static_prefix_cache_control(is_first_party: bool) -> CacheControl { +/// taint the cache prefix. `ttl` overrides the server default (5 m) +/// when set via `config.prompt_cache_ttl`. 
+fn static_prefix_cache_control(is_first_party: bool, ttl: PromptCacheTtl) -> CacheControl { CacheControl { r#type: "ephemeral", scope: is_first_party.then_some("global"), + ttl: ttl.wire(), } } @@ -754,7 +818,10 @@ fn build_completion_body( system: system_blocks, tools: None, thinking: None, - output_config: output_format.map(|format| OutputConfig { format }), + output_config: OutputConfig::new(output_format, None), + // One-shot completions never opt into context management — + // matches claude-code's one-shot path. + context_management: None, messages: &messages, }) .context("failed to serialize request")?; @@ -966,9 +1033,11 @@ fn parse_sse_frame(frame: &str) -> Result> { pub(crate) fn test_config(base_url: impl Into, auth: Auth, model: &str) -> Config { Config { auth, - model: model.to_owned(), base_url: base_url.into(), + model: model.to_owned(), + effort: None, max_tokens: 128, + prompt_cache_ttl: PromptCacheTtl::OneHour, thinking: None, show_thinking: false, } @@ -1021,13 +1090,17 @@ mod tests { Auth::OAuth("t".to_owned()) } - /// Concatenates SSE frames into a valid response body, each - /// followed by the required `\n\n` terminator. - fn sse_body(frames: &[&str]) -> String { + /// Builds an SSE response body from `(event, data)` pairs. Each + /// frame is emitted as `event: \ndata: \n\n`, encoding + /// the frame-separator invariant in one place so call sites don't + /// hand-roll it (and can't silently omit the `\n\n`). + fn sse_body(frames: &[(&str, &str)]) -> String { + use std::fmt::Write; let mut body = String::new(); - for f in frames { - body.push_str(f); - body.push_str("\n\n"); + for (event, data) in frames { + writeln!(body, "event: {event}").unwrap(); + writeln!(body, "data: {data}").unwrap(); + body.push('\n'); } body } @@ -1045,18 +1118,27 @@ mod tests { /// Well-formed SSE body for a short text response. 
fn text_stream_body() -> String { sse_body(&[ - r#"event: message_start -data: {"type":"message_start","message":{"id":"msg_1","model":"claude-sonnet-4-6","usage":{"input_tokens":5,"output_tokens":0}}}"#, - r#"event: content_block_start -data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}"#, - r#"event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hi"}}"#, - r#"event: content_block_stop -data: {"type":"content_block_stop","index":0}"#, - r#"event: message_delta -data: {"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":1}}"#, - r#"event: message_stop -data: {"type":"message_stop"}"#, + ( + "message_start", + r#"{"type":"message_start","message":{"id":"msg_1","model":"claude-sonnet-4-6","usage":{"input_tokens":5,"output_tokens":0}}}"#, + ), + ( + "content_block_start", + r#"{"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}"#, + ), + ( + "content_block_delta", + r#"{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hi"}}"#, + ), + ( + "content_block_stop", + r#"{"type":"content_block_stop","index":0}"#, + ), + ( + "message_delta", + r#"{"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":1}}"#, + ), + ("message_stop", r#"{"type":"message_stop"}"#), ]) } @@ -1369,12 +1451,15 @@ data: {"type":"message_stop"}"#, // would mangle a 4-byte emoji split across TCP chunk boundaries. 
let server = MockServer::start().await; let body = sse_body(&[ - r#"event: content_block_start -data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}"#, - r#"event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"🦀rust"}}"#, - r#"event: message_stop -data: {"type":"message_stop"}"#, + ( + "content_block_start", + r#"{"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}"#, + ), + ( + "content_block_delta", + r#"{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"🦀rust"}}"#, + ), + ("message_stop", r#"{"type":"message_stop"}"#), ]); Mock::given(method("POST")) .and(path("/v1/messages")) @@ -1414,14 +1499,16 @@ data: {"type":"message_stop"}"#, // one bad frame cannot poison the whole turn. let server = MockServer::start().await; let body = sse_body(&[ - r#"event: content_block_start -data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}"#, - r"event: content_block_delta -data: {not valid json", - r#"event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hi"}}"#, - r#"event: message_stop -data: {"type":"message_stop"}"#, + ( + "content_block_start", + r#"{"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}"#, + ), + ("content_block_delta", "{not valid json"), + ( + "content_block_delta", + r#"{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hi"}}"#, + ), + ("message_stop", r#"{"type":"message_stop"}"#), ]); Mock::given(method("POST")) .and(path("/v1/messages")) @@ -1460,8 +1547,10 @@ data: {"type":"message_stop"}"#, // `StreamEvent::Error` flows as `Ok(Error { .. })` on the channel; // the caller (`agent.rs`) converts it to a bail!. 
let server = MockServer::start().await; - let body = sse_body(&[r#"event: error -data: {"type":"error","error":{"type":"overloaded_error","message":"Servers overloaded"}}"#]); + let body = sse_body(&[( + "error", + r#"{"type":"error","error":{"type":"overloaded_error","message":"Servers overloaded"}}"#, + )]); Mock::given(method("POST")) .and(path("/v1/messages")) .respond_with( @@ -1548,7 +1637,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over let mut rx = client .stream_message(&[Message::user("hi")], &[], None, &[]) .unwrap(); - let _ = rx.recv().await; + _ = rx.recv().await; drop(rx); // Lets the background task observe the closed channel and exit; // any panic would surface in test output. @@ -1776,6 +1865,128 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over cc.get("scope").is_none(), "scope field omitted entirely on 3P (not null): {body}", ); + // TTL rides through on 3P — only `scope` is gated on 1P. + assert_eq!(cc["ttl"], "1h", "default 1h ttl survives on 3P: {body}"); + } + + // ── Client::stream_message / agentic body fields ── + + /// Captures the serialized body of a single streaming request. + /// Most agentic-body tests only care about what oxide-code sends, + /// not the response — this collapses the ceremony to two lines + /// per test. 
+ async fn capture_stream_body(config: Config) -> serde_json::Value { + let server = MockServer::start().await; + let sink: Captured = captured(); + let sink_clone = std::sync::Arc::clone(&sink); + Mock::given(method("POST")) + .and(path("/v1/messages")) + .respond_with(move |req: &Request| { + *sink_clone.lock().unwrap() = Some(String::from_utf8_lossy(&req.body).into_owned()); + ResponseTemplate::new(200) + .insert_header("content-type", "text/event-stream") + .set_body_string(text_stream_body()) + }) + .mount(&server) + .await; + + let mut cfg = config; + cfg.base_url = server.uri(); + let client = Client::new(cfg, Some("sid".to_owned())).unwrap(); + collect_events( + client + .stream_message(&[Message::user("hi")], &[], None, &[]) + .unwrap(), + ) + .await + .unwrap(); + let body = sink.lock().unwrap().clone().expect("request captured"); + serde_json::from_str(&body).unwrap() + } + + #[tokio::test] + async fn stream_message_opus_4_7_emits_output_config_effort_xhigh() { + let mut cfg = test_config("https://placeholder.invalid", api_key(), "claude-opus-4-7"); + cfg.effort = Some(Effort::Xhigh); + let body = capture_stream_body(cfg).await; + assert_eq!(body["output_config"]["effort"], "xhigh"); + } + + #[tokio::test] + async fn stream_message_omits_output_config_when_effort_is_none() { + // Non-effort-capable model → `Config.effort == None` → the + // whole `output_config` block is absent (not `{}`). + let cfg = test_config( + "https://placeholder.invalid", + api_key(), + "claude-sonnet-4-5", + ); + assert!(cfg.effort.is_none(), "precondition: effort unset"); + let body = capture_stream_body(cfg).await; + assert!( + body.get("output_config").is_none(), + "output_config absent: {body}", + ); + } + + #[tokio::test] + async fn stream_message_context_management_body_present_on_4_6_plus() { + // Every model whose `context_management` capability flag is + // set must also ship the body directive alongside the beta + // header. 
+ for model in [ + "claude-opus-4-7", + "claude-opus-4-6", + "claude-sonnet-4-6", + "claude-haiku-4-5", + ] { + let cfg = test_config("https://placeholder.invalid", api_key(), model); + let body = capture_stream_body(cfg).await; + let edits = body["context_management"]["edits"] + .as_array() + .unwrap_or_else(|| panic!("context_management.edits missing for {model}: {body}")); + assert_eq!(edits.len(), 1, "{model}"); + assert_eq!(edits[0]["type"], "clear_thinking_20251015", "{model}"); + assert_eq!(edits[0]["keep"], "all", "{model}"); + } + } + + #[tokio::test] + async fn stream_message_context_management_absent_on_unknown_model() { + // Unknown model ids (no `MODELS` row matches) fall back to + // the all-false `Capabilities::default()` — no beta, no body + // directive. Keeps "beta sent ⇒ body populated" an invariant. + let cfg = test_config("https://placeholder.invalid", api_key(), "claude-opus-5-0"); + let body = capture_stream_body(cfg).await; + assert!( + body.get("context_management").is_none(), + "context_management absent on unknown models: {body}", + ); + } + + #[tokio::test] + async fn stream_message_show_thinking_emits_display_summarized() { + let mut cfg = test_config("https://placeholder.invalid", api_key(), "claude-opus-4-7"); + cfg.thinking = Some(ThinkingConfig::Adaptive { + display: Some(crate::config::ThinkingDisplay::Summarized), + }); + let body = capture_stream_body(cfg).await; + assert_eq!(body["thinking"]["type"], "adaptive"); + assert_eq!(body["thinking"]["display"], "summarized"); + } + + #[tokio::test] + async fn stream_message_show_thinking_false_omits_display_field() { + // `Adaptive { display: None }` must serialize without a + // `display` key — `skip_serializing_if` on the wire. 
+ let mut cfg = test_config("https://placeholder.invalid", api_key(), "claude-opus-4-7"); + cfg.thinking = Some(ThinkingConfig::Adaptive { display: None }); + let body = capture_stream_body(cfg).await; + assert_eq!(body["thinking"]["type"], "adaptive"); + assert!( + body["thinking"].get("display").is_none(), + "display field absent: {body}", + ); } // ── Client::complete ── @@ -1864,7 +2075,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over Some("sid".to_owned()), ) .unwrap(); - let _ = client + _ = client .complete(model, "sys", "prompt", 40, Some(&fmt)) .await .unwrap(); @@ -1916,7 +2127,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over Some("sid".to_owned()), ) .unwrap(); - let _ = client + _ = client .complete("claude-haiku-4-5", "", "hi", 40, None) .await .unwrap(); @@ -1934,6 +2145,41 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over assert!(!body.contains("cch=00000"), "cch populated: {body}"); } + #[tokio::test] + async fn complete_does_not_emit_context_management_edits() { + // `context_management.edits` is an agentic-path directive; it + // must stay off the one-shot `complete` path even on models + // that carry the capability flag (Haiku 4.5 here). 
+ let server = MockServer::start().await; + let sink: Captured = captured(); + let sink_clone = std::sync::Arc::clone(&sink); + Mock::given(method("POST")) + .and(path("/v1/messages")) + .respond_with(move |req: &Request| { + *sink_clone.lock().unwrap() = Some(String::from_utf8_lossy(&req.body).into_owned()); + ResponseTemplate::new(200).set_body_string(completion_body("ok")) + }) + .mount(&server) + .await; + + let client = Client::new( + test_config(server.uri(), api_key(), "claude-haiku-4-5"), + Some("sid".to_owned()), + ) + .unwrap(); + _ = client + .complete("claude-haiku-4-5", "sys", "hi", 40, None) + .await + .unwrap(); + + let body = sink.lock().unwrap().clone().expect("body captured"); + let v: serde_json::Value = serde_json::from_str(&body).unwrap(); + assert!( + v.get("context_management").is_none(), + "context_management absent on one-shot path: {body}", + ); + } + // ── build_metadata ── #[test] @@ -1949,7 +2195,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over // ── compute_betas ── #[test] - fn compute_betas_agentic_opus_46_plain_carries_full_set_except_1m() { + fn compute_betas_agentic_opus_4_6_plain_carries_full_set_except_1m() { // Plain model (no `[1m]` tag) must not auto-enable 1M context — // a gateway without 1M access would 400. 
let betas = compute_betas("claude-opus-4-6", &api_key(), true, false, true); @@ -1964,7 +2210,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over } #[test] - fn compute_betas_opus_46_with_1m_tag_adds_context_1m() { + fn compute_betas_opus_4_6_with_1m_tag_adds_context_1m() { let betas = compute_betas("claude-opus-4-6[1m]", &api_key(), true, false, true); assert!(betas.contains(&CONTEXT_1M_BETA_HEADER)); assert!(betas.contains(&EFFORT_BETA_HEADER)); @@ -1977,7 +2223,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over } #[test] - fn compute_betas_sonnet_45_has_thinking_but_not_effort() { + fn compute_betas_sonnet_4_5_has_thinking_but_not_effort() { // Sonnet 4.5 supports interleaved thinking but not effort; // plain (no `[1m]` tag) means no 1M beta either. let betas = compute_betas("claude-sonnet-4-5", &api_key(), true, false, true); @@ -2030,7 +2276,7 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over } #[test] - fn compute_betas_opus_47_matches_opus_46_family() { + fn compute_betas_opus_4_7_matches_opus_4_6_family() { let plain = compute_betas("claude-opus-4-7", &api_key(), true, false, true); assert!(plain.contains(&INTERLEAVED_THINKING_BETA_HEADER)); assert!(plain.contains(&CONTEXT_MANAGEMENT_BETA_HEADER)); @@ -2105,22 +2351,30 @@ data: {"type":"error","error":{"type":"overloaded_error","message":"Servers over #[test] fn static_prefix_cache_control_emits_global_scope_on_first_party_only() { - // 1P → `{"type":"ephemeral","scope":"global"}` — global cache. - // 3P → `{"type":"ephemeral"}` — default (org) scope; every - // gateway accepts this. 
- let first = static_prefix_cache_control(true); + let first = static_prefix_cache_control(true, PromptCacheTtl::OneHour); assert_eq!(first.r#type, "ephemeral"); assert_eq!(first.scope, Some("global")); - let third = static_prefix_cache_control(false); + let third = static_prefix_cache_control(false, PromptCacheTtl::OneHour); assert_eq!(third.r#type, "ephemeral"); assert_eq!(third.scope, None); + } - // Round-trip through JSON to pin the on-wire shape — the - // `scope` key must be absent (not `null`) in the 3P case so - // gateways that validate the field strictly accept it. - let wire = serde_json::to_string(&third).unwrap(); - assert_eq!(wire, r#"{"type":"ephemeral"}"#); + #[test] + fn static_prefix_cache_control_ttl_matches_config() { + // 1h → `ttl: "1h"` in the wire. 5m → field absent entirely + // (matches server default; keeps the pre-2026-03 wire shape). + let one_hour = static_prefix_cache_control(false, PromptCacheTtl::OneHour); + assert_eq!( + serde_json::to_string(&one_hour).unwrap(), + r#"{"type":"ephemeral","ttl":"1h"}"#, + ); + + let five_min = static_prefix_cache_control(false, PromptCacheTtl::FiveMin); + assert_eq!( + serde_json::to_string(&five_min).unwrap(), + r#"{"type":"ephemeral"}"#, + ); } // ── api_model_id / has_1m_tag ── diff --git a/crates/oxide-code/src/config.rs b/crates/oxide-code/src/config.rs index d495f92..986d4bc 100644 --- a/crates/oxide-code/src/config.rs +++ b/crates/oxide-code/src/config.rs @@ -8,8 +8,11 @@ mod file; mod oauth; -use anyhow::{Context, Result}; -use serde::Serialize; +use std::fmt; +use std::str::FromStr; + +use anyhow::{Context, Result, bail}; +use serde::{Deserialize, Serialize}; use crate::util::env; @@ -28,16 +31,134 @@ pub enum Auth { #[derive(Debug, Clone, Serialize)] #[serde(tag = "type", rename_all = "snake_case")] pub enum ThinkingConfig { - /// Model decides the thinking budget (Claude 4.6+). - Adaptive, + /// Model decides the thinking budget (Claude 4.6+). 
`display` + /// controls what the API streams back: `Omitted` (4.7 default, + /// empty `thinking` field) or `Summarized` (the 4.6 default, and + /// what oxide-code enables whenever `show_thinking=true`). + Adaptive { + #[serde(skip_serializing_if = "Option::is_none")] + display: Option, + }, +} + +/// `thinking.display` values accepted by the API on 4.7+. Only +/// `Summarized` is ever emitted — omitting the field entirely (via +/// `display: None`) already yields the `omitted` default on 4.7. +#[derive(Debug, Clone, Copy, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ThinkingDisplay { + Summarized, +} + +/// Intelligence-vs-latency tier sent as `output_config.effort` on +/// effort-capable models. The per-model ceiling lives in +/// [`crate::model::Capabilities`]. +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum Effort { + Low, + Medium, + High, + Xhigh, + Max, +} + +impl Effort { + const fn as_str(self) -> &'static str { + match self { + Self::Low => "low", + Self::Medium => "medium", + Self::High => "high", + Self::Xhigh => "xhigh", + Self::Max => "max", + } + } +} + +impl fmt::Display for Effort { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl FromStr for Effort { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "low" => Ok(Self::Low), + "medium" => Ok(Self::Medium), + "high" => Ok(Self::High), + "xhigh" => Ok(Self::Xhigh), + "max" => Ok(Self::Max), + _ => bail!("invalid effort {s:?}; expected one of: low, medium, high, xhigh, max"), + } + } +} + +/// Prompt-cache TTL sent as `cache_control.ttl`. Anthropic silently +/// dropped the default from 1h to 5m on 2026-03-06, so `OneHour` is +/// explicit opt-in. oxide-code defaults to `OneHour`. 
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] +pub enum PromptCacheTtl { + #[serde(rename = "5m")] + FiveMin, + #[serde(rename = "1h")] + OneHour, +} + +impl PromptCacheTtl { + /// Wire value for `cache_control.ttl`. `None` when the TTL is + /// the server default (5 m) so the JSON omits the field entirely. + pub(crate) const fn wire(self) -> Option<&'static str> { + match self { + Self::FiveMin => None, + Self::OneHour => Some("1h"), + } + } + + const fn as_str(self) -> &'static str { + match self { + Self::FiveMin => "5m", + Self::OneHour => "1h", + } + } +} + +impl fmt::Display for PromptCacheTtl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl FromStr for PromptCacheTtl { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "5m" => Ok(Self::FiveMin), + "1h" => Ok(Self::OneHour), + _ => bail!("invalid prompt_cache_ttl {s:?}; expected one of: 5m, 1h"), + } + } } +/// Resolved configuration. #[derive(Debug, Clone)] pub struct Config { pub auth: Auth, - pub model: String, pub base_url: String, + pub model: String, + /// `output_config.effort` for the streaming path. `None` means + /// the model doesn't accept the parameter and the field is + /// omitted. Resolved once at [`Config::load`] — callers forward. + pub effort: Option, pub max_tokens: u32, + /// `cache_control.ttl` for every cacheable block. Default is + /// [`PromptCacheTtl::OneHour`] since Anthropic's 2026-03 TTL + /// drop made the server default (5 m) a silent cost regression + /// on long sessions. 
+ pub prompt_cache_ttl: PromptCacheTtl, pub thinking: Option, pub show_thinking: bool, } @@ -74,29 +195,66 @@ impl Config { .or(client.base_url) .unwrap_or_else(|| DEFAULT_BASE_URL.to_owned()); + let caps = crate::model::capabilities_for(&model); + + let effort_pick = match env::string("ANTHROPIC_EFFORT") { + Some(raw) => Some(raw.parse::().context("ANTHROPIC_EFFORT")?), + None => client.effort, + }; + let effort = match effort_pick { + Some(pick) => caps.clamp_effort(pick), + None => caps.default_effort(), + }; + let max_tokens = env::string("ANTHROPIC_MAX_TOKENS") .and_then(|v| v.parse().ok()) .or(client.max_tokens) - .unwrap_or(DEFAULT_MAX_TOKENS); - - // Adaptive thinking is always enabled — the model decides the budget. - let thinking = Some(ThinkingConfig::Adaptive); + .unwrap_or_else(|| default_max_tokens(effort)); let show_thinking = env::bool("OX_SHOW_THINKING") .or(tui.show_thinking) .unwrap_or(false); + // Adaptive thinking is always enabled — the model decides the + // budget. `display` opts 4.7 into streaming summarized thinking + // text (its default changed to `omitted` silently); 4.6 and + // older ignore the field. + let thinking = Some(ThinkingConfig::Adaptive { + display: show_thinking.then_some(ThinkingDisplay::Summarized), + }); + + let prompt_cache_ttl = match env::string("OX_PROMPT_CACHE_TTL") { + Some(raw) => raw + .parse::() + .context("OX_PROMPT_CACHE_TTL")?, + None => client.prompt_cache_ttl.unwrap_or(PromptCacheTtl::OneHour), + }; + Ok(Self { auth, - model, base_url, + model, + effort, max_tokens, + prompt_cache_ttl, thinking, show_thinking, }) } } +/// Per-effort `max_tokens` default. Matches claude-code 2.1.119's +/// observed values: 64 K for the top two tiers (xhigh / max), 32 K +/// for high, the legacy 16 384 for everything else. Users override +/// via `ANTHROPIC_MAX_TOKENS` / `[client].max_tokens`. 
+fn default_max_tokens(effort: Option<Effort>) -> u32 {
+    match effort {
+        Some(Effort::Xhigh | Effort::Max) => 64_000,
+        Some(Effort::High) => 32_000,
+        _ => DEFAULT_MAX_TOKENS,
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::collections::HashSet;
@@ -109,9 +267,89 @@ mod tests {
     // ── ThinkingConfig ──
 
     #[test]
-    fn thinking_config_adaptive_serializes() {
-        let json = serde_json::to_value(&ThinkingConfig::Adaptive).unwrap();
+    fn thinking_config_adaptive_without_display_serializes_bare() {
+        // Older models ignore `display`; absence keeps the wire as
+        // pre-4.7 clients expect.
+        let json = serde_json::to_value(&ThinkingConfig::Adaptive { display: None }).unwrap();
         assert_eq!(json["type"], "adaptive");
+        assert!(json.get("display").is_none(), "display omitted: {json}");
+    }
+
+    #[test]
+    fn thinking_config_adaptive_with_summarized_display_serializes() {
+        let json = serde_json::to_value(&ThinkingConfig::Adaptive {
+            display: Some(ThinkingDisplay::Summarized),
+        })
+        .unwrap();
+        assert_eq!(json["type"], "adaptive");
+        assert_eq!(json["display"], "summarized");
+    }
+
+    // ── Effort ──
+
+    #[test]
+    fn effort_round_trips_through_serde_and_fromstr() {
+        for (variant, token) in [
+            (Effort::Low, "low"),
+            (Effort::Medium, "medium"),
+            (Effort::High, "high"),
+            (Effort::Xhigh, "xhigh"),
+            (Effort::Max, "max"),
+        ] {
+            assert_eq!(serde_json::to_value(variant).unwrap(), token);
+            assert_eq!(variant.to_string(), token);
+            assert_eq!(token.parse::<Effort>().unwrap(), variant);
+        }
+    }
+
+    #[test]
+    fn effort_rejects_unknown_tokens_with_actionable_error() {
+        let err = "extra-high".parse::<Effort>().expect_err("unknown token");
toml::from_str(r#"effort = "xhigh""#).unwrap(); + assert_eq!(wrap.effort, Effort::Xhigh); + } + + // ── PromptCacheTtl ── + + #[test] + fn prompt_cache_ttl_wire_shape() { + // 5m is the server default → field omitted. 1h opts in → "1h". + assert_eq!(PromptCacheTtl::FiveMin.wire(), None); + assert_eq!(PromptCacheTtl::OneHour.wire(), Some("1h")); + } + + #[test] + fn prompt_cache_ttl_round_trips_through_serde_and_fromstr() { + for (variant, token) in [ + (PromptCacheTtl::FiveMin, "5m"), + (PromptCacheTtl::OneHour, "1h"), + ] { + assert_eq!(serde_json::to_value(variant).unwrap(), token); + assert_eq!(variant.to_string(), token); + assert_eq!(token.parse::().unwrap(), variant); + } + } + + #[test] + fn prompt_cache_ttl_rejects_unknown_tokens_with_actionable_error() { + let err = "30m".parse::().expect_err("unknown token"); + let msg = format!("{err:#}"); + assert!(msg.contains("30m"), "{msg}"); + assert!(msg.contains("5m"), "{msg}"); + assert!(msg.contains("1h"), "{msg}"); } // ── Config::load ── @@ -125,7 +363,9 @@ mod tests { "ANTHROPIC_MODEL", "ANTHROPIC_BASE_URL", "ANTHROPIC_MAX_TOKENS", + "ANTHROPIC_EFFORT", "OX_SHOW_THINKING", + "OX_PROMPT_CACHE_TTL", "XDG_CONFIG_HOME", ]; @@ -176,13 +416,19 @@ mod tests { #[tokio::test] async fn load_defaults_apply_when_no_config_and_no_env() { + // Default model (Opus 4.7) supports `xhigh`, so both `effort` + // and `max_tokens` derive from that ceiling — matches the + // claude-code 2.1.119 packet capture. Prompt cache defaults + // to 1h (opt-out via `OX_PROMPT_CACHE_TTL=5m`). 
let dir = tempfile::tempdir().unwrap(); let config = temp_env::async_with_vars(env_vars(vec![xdg(&dir)]), Config::load()) .await .unwrap(); assert_eq!(config.model, DEFAULT_MODEL); assert_eq!(config.base_url, DEFAULT_BASE_URL); - assert_eq!(config.max_tokens, DEFAULT_MAX_TOKENS); + assert_eq!(config.max_tokens, 64_000); + assert_eq!(config.effort, Some(Effort::Xhigh)); + assert_eq!(config.prompt_cache_ttl, PromptCacheTtl::OneHour); assert!(!config.show_thinking); assert!(matches!(config.auth, Auth::ApiKey(k) if k == "sk-default")); } @@ -298,7 +544,10 @@ mod tests { let config = temp_env::async_with_vars(env_vars(vec![xdg(&dir)]), Config::load()) .await .unwrap(); - assert!(matches!(config.thinking, Some(ThinkingConfig::Adaptive))); + assert!(matches!( + config.thinking, + Some(ThinkingConfig::Adaptive { display: None }), + )); } #[tokio::test] @@ -345,4 +594,184 @@ mod tests { assert!(msg.contains("invalid config at"), "{msg}"); assert!(msg.contains("unknown field `show_thinking`"), "{msg}"); } + + // ── Config::load / effort resolution ── + + #[tokio::test] + async fn load_effort_default_follows_model_ceiling() { + for (model, expected) in [ + ("claude-opus-4-7", Some(Effort::Xhigh)), + ("claude-opus-4-6", Some(Effort::High)), + ("claude-sonnet-4-6", Some(Effort::High)), + ("claude-sonnet-4-5", None), + ("claude-haiku-4-5", None), + ] { + let dir = tempfile::tempdir().unwrap(); + let vars = env_vars(vec![xdg(&dir), env("ANTHROPIC_MODEL", model)]); + let config = temp_env::async_with_vars(vars, Config::load()) + .await + .unwrap(); + assert_eq!(config.effort, expected, "model={model}"); + } + } + + #[tokio::test] + async fn load_effort_env_overrides_per_model_default() { + let dir = tempfile::tempdir().unwrap(); + let vars = env_vars(vec![ + xdg(&dir), + env("ANTHROPIC_MODEL", "claude-opus-4-7"), + env("ANTHROPIC_EFFORT", "low"), + ]); + let config = temp_env::async_with_vars(vars, Config::load()) + .await + .unwrap(); + assert_eq!(config.effort, 
Some(Effort::Low)); + } + + #[tokio::test] + async fn load_effort_clamps_xhigh_down_to_high_on_sonnet_4_6() { + // Sonnet 4.6 supports `effort` but not `xhigh` / `max` — the + // user's pick must clamp rather than 400 the gateway. + let dir = tempfile::tempdir().unwrap(); + let vars = env_vars(vec![ + xdg(&dir), + env("ANTHROPIC_MODEL", "claude-sonnet-4-6"), + env("ANTHROPIC_EFFORT", "xhigh"), + ]); + let config = temp_env::async_with_vars(vars, Config::load()) + .await + .unwrap(); + assert_eq!(config.effort, Some(Effort::High)); + } + + #[tokio::test] + async fn load_effort_clamps_to_none_on_non_effort_capable_model() { + let dir = tempfile::tempdir().unwrap(); + let vars = env_vars(vec![ + xdg(&dir), + env("ANTHROPIC_MODEL", "claude-haiku-4-5"), + env("ANTHROPIC_EFFORT", "max"), + ]); + let config = temp_env::async_with_vars(vars, Config::load()) + .await + .unwrap(); + assert_eq!(config.effort, None); + } + + #[tokio::test] + async fn load_effort_file_picks_up_when_env_unset() { + let dir = tempfile::tempdir().unwrap(); + write_user_config( + dir.path(), + indoc::indoc! {r#" + [client] + model = "claude-opus-4-7" + effort = "medium" + "#}, + ); + let config = temp_env::async_with_vars(env_vars(vec![xdg(&dir)]), Config::load()) + .await + .unwrap(); + assert_eq!(config.effort, Some(Effort::Medium)); + } + + #[tokio::test] + async fn load_effort_env_beats_file() { + let dir = tempfile::tempdir().unwrap(); + write_user_config( + dir.path(), + indoc::indoc! 
{r#" + [client] + model = "claude-opus-4-7" + effort = "low" + "#}, + ); + let vars = env_vars(vec![xdg(&dir), env("ANTHROPIC_EFFORT", "max")]); + let config = temp_env::async_with_vars(vars, Config::load()) + .await + .unwrap(); + assert_eq!(config.effort, Some(Effort::Max)); + } + + #[tokio::test] + async fn load_effort_invalid_env_surfaces_parse_error() { + let dir = tempfile::tempdir().unwrap(); + let vars = env_vars(vec![xdg(&dir), env("ANTHROPIC_EFFORT", "insane")]); + let err = temp_env::async_with_vars(vars, Config::load()) + .await + .expect_err("invalid effort must propagate"); + let msg = format!("{err:#}"); + assert!(msg.contains("ANTHROPIC_EFFORT"), "{msg}"); + assert!(msg.contains("insane"), "{msg}"); + } + + // ── default_max_tokens ── + + #[test] + fn default_max_tokens_scales_with_effort() { + assert_eq!(default_max_tokens(Some(Effort::Max)), 64_000); + assert_eq!(default_max_tokens(Some(Effort::Xhigh)), 64_000); + assert_eq!(default_max_tokens(Some(Effort::High)), 32_000); + assert_eq!(default_max_tokens(Some(Effort::Medium)), DEFAULT_MAX_TOKENS); + assert_eq!(default_max_tokens(Some(Effort::Low)), DEFAULT_MAX_TOKENS); + assert_eq!(default_max_tokens(None), DEFAULT_MAX_TOKENS); + } + + // ── Config::load / prompt_cache_ttl ── + + #[tokio::test] + async fn load_prompt_cache_ttl_env_overrides_default() { + let dir = tempfile::tempdir().unwrap(); + let vars = env_vars(vec![xdg(&dir), env("OX_PROMPT_CACHE_TTL", "5m")]); + let config = temp_env::async_with_vars(vars, Config::load()) + .await + .unwrap(); + assert_eq!(config.prompt_cache_ttl, PromptCacheTtl::FiveMin); + } + + #[tokio::test] + async fn load_prompt_cache_ttl_file_picks_up_when_env_unset() { + let dir = tempfile::tempdir().unwrap(); + write_user_config( + dir.path(), + indoc::indoc! 
{r#"
+                [client]
+                prompt_cache_ttl = "5m"
+            "#},
+        );
+        let config = temp_env::async_with_vars(env_vars(vec![xdg(&dir)]), Config::load())
+            .await
+            .unwrap();
+        assert_eq!(config.prompt_cache_ttl, PromptCacheTtl::FiveMin);
+    }
+
+    #[tokio::test]
+    async fn load_prompt_cache_ttl_env_beats_file() {
+        let dir = tempfile::tempdir().unwrap();
+        write_user_config(
+            dir.path(),
+            indoc::indoc! {r#"
+                [client]
+                prompt_cache_ttl = "5m"
+            "#},
+        );
+        let vars = env_vars(vec![xdg(&dir), env("OX_PROMPT_CACHE_TTL", "1h")]);
+        let config = temp_env::async_with_vars(vars, Config::load())
+            .await
+            .unwrap();
+        assert_eq!(config.prompt_cache_ttl, PromptCacheTtl::OneHour);
+    }
+
+    #[tokio::test]
+    async fn load_prompt_cache_ttl_invalid_env_surfaces_parse_error() {
+        let dir = tempfile::tempdir().unwrap();
+        let vars = env_vars(vec![xdg(&dir), env("OX_PROMPT_CACHE_TTL", "forever")]);
+        let err = temp_env::async_with_vars(vars, Config::load())
+            .await
+            .expect_err("invalid ttl must propagate");
+        let msg = format!("{err:#}");
+        assert!(msg.contains("OX_PROMPT_CACHE_TTL"), "{msg}");
+        assert!(msg.contains("forever"), "{msg}");
+    }
 }
diff --git a/crates/oxide-code/src/config/file.rs b/crates/oxide-code/src/config/file.rs
index 233dd1b..bdc45b3 100644
--- a/crates/oxide-code/src/config/file.rs
+++ b/crates/oxide-code/src/config/file.rs
@@ -33,13 +33,19 @@ pub(super) struct FileConfig {
 }
 
 /// API client settings (`[client]` section).
+///
+/// Fields are grouped by concern so adjacent lines stay related:
+/// connection (`api_key`, `base_url`), model selection (`model`,
+/// `effort`), then request tuning (`max_tokens`, `prompt_cache_ttl`).
 #[derive(Debug, Default, Deserialize)]
 #[serde(deny_unknown_fields)]
 pub(super) struct ClientConfig {
     pub api_key: Option<String>,
-    pub model: Option<String>,
     pub base_url: Option<String>,
+    pub model: Option<String>,
+    pub effort: Option<Effort>,
     pub max_tokens: Option<u32>,
+    pub prompt_cache_ttl: Option<PromptCacheTtl>,
 }
 
 /// Terminal UI settings (`[tui]` section).
@@ -65,9 +71,11 @@ impl ClientConfig { fn merge(self, other: Self) -> Self { Self { api_key: other.api_key.or(self.api_key), - model: other.model.or(self.model), base_url: other.base_url.or(self.base_url), + model: other.model.or(self.model), + effort: other.effort.or(self.effort), max_tokens: other.max_tokens.or(self.max_tokens), + prompt_cache_ttl: other.prompt_cache_ttl.or(self.prompt_cache_ttl), } } } @@ -186,9 +194,11 @@ mod tests { let base = FileConfig { client: Some(ClientConfig { api_key: Some("base-key".to_owned()), - model: Some("base-model".to_owned()), base_url: Some("https://base.example.com".to_owned()), + model: Some("base-model".to_owned()), + effort: Some(super::super::Effort::Low), max_tokens: Some(1000), + prompt_cache_ttl: Some(super::super::PromptCacheTtl::FiveMin), }), tui: Some(TuiConfig { show_thinking: Some(false), @@ -197,9 +207,11 @@ mod tests { let other = FileConfig { client: Some(ClientConfig { api_key: Some("other-key".to_owned()), - model: Some("other-model".to_owned()), base_url: Some("https://other.example.com".to_owned()), + model: Some("other-model".to_owned()), + effort: Some(super::super::Effort::Max), max_tokens: Some(2000), + prompt_cache_ttl: Some(super::super::PromptCacheTtl::OneHour), }), tui: Some(TuiConfig { show_thinking: Some(true), @@ -209,12 +221,17 @@ mod tests { let client = merged.client.expect("client section should be present"); assert_eq!(client.api_key.as_deref(), Some("other-key")); - assert_eq!(client.model.as_deref(), Some("other-model")); assert_eq!( client.base_url.as_deref(), Some("https://other.example.com") ); + assert_eq!(client.model.as_deref(), Some("other-model")); + assert_eq!(client.effort, Some(super::super::Effort::Max)); assert_eq!(client.max_tokens, Some(2000)); + assert_eq!( + client.prompt_cache_ttl, + Some(super::super::PromptCacheTtl::OneHour) + ); let tui = merged.tui.expect("tui section should be present"); assert_eq!(tui.show_thinking, Some(true)); @@ -225,9 +242,11 @@ mod tests { let 
base = FileConfig { client: Some(ClientConfig { api_key: Some("key".to_owned()), - model: Some("model".to_owned()), base_url: Some("https://example.com".to_owned()), + model: Some("model".to_owned()), + effort: Some(super::super::Effort::High), max_tokens: Some(4096), + prompt_cache_ttl: Some(super::super::PromptCacheTtl::FiveMin), }), tui: Some(TuiConfig { show_thinking: Some(true), @@ -237,9 +256,14 @@ mod tests { let client = merged.client.expect("client section should survive"); assert_eq!(client.api_key.as_deref(), Some("key")); - assert_eq!(client.model.as_deref(), Some("model")); assert_eq!(client.base_url.as_deref(), Some("https://example.com")); + assert_eq!(client.model.as_deref(), Some("model")); + assert_eq!(client.effort, Some(super::super::Effort::High)); assert_eq!(client.max_tokens, Some(4096)); + assert_eq!( + client.prompt_cache_ttl, + Some(super::super::PromptCacheTtl::FiveMin) + ); let tui = merged.tui.expect("tui section should survive"); assert_eq!(tui.show_thinking, Some(true)); diff --git a/crates/oxide-code/src/model.rs b/crates/oxide-code/src/model.rs index 3bbe421..8d21c4f 100644 --- a/crates/oxide-code/src/model.rs +++ b/crates/oxide-code/src/model.rs @@ -13,23 +13,30 @@ //! an experimental beta than 400 a request. //! //! Capability flags mirror the third-party-gateway branch of the upstream -//! `modelSupports*` predicates: +//! `modelSupports*` predicates (substring rules) and a few client-side +//! additions that come from the migration guide + live packet captures +//! (per-version allowlists): //! //! - `interleaved_thinking` ← `modelSupportsISP` — substring `opus-4` or //! `sonnet-4`. //! - `context_management` ← `modelSupportsContextManagement` — substring //! `opus-4`, `sonnet-4`, or `haiku-4`. -//! - `effort` ← `modelSupportsEffort` — substring `opus-4-6` or -//! `sonnet-4-6`. //! - `context_1m` ← `modelSupports1M` — substring `claude-sonnet-4` or //! `opus-4-6`. +//! 
- `effort` ← `modelSupportsEffort` — substring `opus-4-6` or +//! `sonnet-4-6`. +//! - `effort_max` — explicit allowlist: Opus 4.6, Opus 4.7. +//! - `effort_xhigh` — explicit allowlist: Opus 4.7. //! - `structured_outputs` ← `modelSupportsStructuredOutputs` — explicit //! allowlist: opus-4-1 / 4-5 / 4-6, sonnet-4-5 / 4-6, haiku-4-5. //! //! `capability_flags_match_upstream_predicates` in the test module locks //! every row to the substring predicates above so a mis-bump fails CI -//! loudly. The structured-outputs allowlist is exercised by a separate -//! enumeration test because it doesn't reduce to a substring rule. +//! loudly. Flags that are allowlist-shaped (`effort_max`, `effort_xhigh`, +//! `structured_outputs`) are exercised by per-flag enumeration tests +//! because they don't reduce to a substring rule. + +use crate::config::Effort; /// Metadata and capability flags for a single Claude model. pub(crate) struct ModelInfo { @@ -44,11 +51,11 @@ pub(crate) struct ModelInfo { } /// Per-model feature flags consulted by the API client to gate beta -/// headers. Each flag corresponds to a `modelSupports*` check in the -/// upstream reference: `interleaved_thinking` → `modelSupportsISP`, -/// `context_management` → `modelSupportsContextManagement`, `effort` → -/// `modelSupportsEffort`, `context_1m` → `modelSupports1M`, -/// `structured_outputs` → `modelSupportsStructuredOutputs`. +/// headers and body fields. `interleaved_thinking`, `context_management`, +/// `effort`, `context_1m`, and `structured_outputs` mirror upstream +/// `modelSupports*` predicates; `effort_max` and `effort_xhigh` are +/// client-side allowlists derived from the migration guide and live +/// packet captures. /// /// `context_1m` does not currently drive beta sending — that signal is /// the user-opt-in `[1m]` tag on the model string. The flag is kept for @@ -56,21 +63,27 @@ pub(crate) struct ModelInfo { /// on models that can't honor it. 
#[expect( clippy::struct_excessive_bools, - reason = "five independent capability flags — each maps 1:1 to a \ - separate upstream `modelSupports*` predicate; a bitflag or \ - state-machine refactor would add indirection without any \ - expressiveness gain" + reason = "seven independent capability flags — each maps 1:1 to a \ + separate upstream `modelSupports*` predicate or a \ + per-version allowlist; a bitflag or state-machine refactor \ + would add indirection without any expressiveness gain" )] #[derive(Copy, Clone, Default)] pub(crate) struct Capabilities { pub(crate) interleaved_thinking: bool, pub(crate) context_management: bool, - pub(crate) effort: bool, /// Whether the model accepts the `context-1m-2025-08-07` beta. /// `compute_betas` gates the beta on `has_1m_tag(model) AND /// context_1m` so a user who tags `claude-haiku-4[1m]` doesn't /// silently send an unsupported beta and 400. pub(crate) context_1m: bool, + /// Gates `output_config.effort` at `low` / `medium` / `high`. + /// Upper bound: see [`Self::effort_max`] / [`Self::effort_xhigh`]. + pub(crate) effort: bool, + /// Whether `effort = "max"` is accepted. Opus-only. + pub(crate) effort_max: bool, + /// Whether `effort = "xhigh"` is accepted. Opus 4.7 only. + pub(crate) effort_xhigh: bool, /// Whether the model accepts the `structured-outputs-2025-12-15` /// beta (JSON-schema-constrained text output). The upstream /// allowlist is Opus 4.1/4.5/4.6, Sonnet 4.5/4.6, Haiku 4.5; @@ -93,15 +106,15 @@ pub(crate) struct Capabilities { /// nearest sibling and flip the flags that the upstream predicate(s) /// change. /// -/// The one intentional divergence from the reference: Opus 4.7 -/// postdates the upstream snapshot we have on hand. 
Treating it as -/// 4.6-equivalent (all caps) reflects the monotonic-capability -/// assumption and the confirmed-in-the-wild shape of the model; -/// strictly substring-matching the upstream predicates against 4.7 -/// would wrongly deny `effort` and `1M`, both of which work. +/// The one intentional divergence from the substring-predicate rules: +/// Opus 4.7 postdates the upstream snapshot we have on hand, so it +/// inherits 4.6's monotonic-capability projection for `effort`, +/// `context_management`, and `1M`. 4.7 uniquely adds `effort_xhigh`; +/// `effort_max` is Opus-only per the migration guide (4.6 + 4.7). pub(crate) const MODELS: &[ModelInfo] = &[ - // Upstream predates 4.7; capabilities below are our 4.6-equivalent - // projection, not a reference fact. Validated empirically. + // Upstream predates 4.7; substring-derived flags inherit 4.6 as a + // monotonic projection, and `effort_xhigh` is the one 4.7-only + // addition (rejected as 400 by every other model). ModelInfo { id_substr: "claude-opus-4-7", marketing: "Claude Opus 4.7", @@ -109,8 +122,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: true, context_1m: true, + effort: true, + effort_max: true, + effort_xhigh: true, structured_outputs: true, }, }, @@ -121,8 +136,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: true, context_1m: true, + effort: true, + effort_max: true, + effort_xhigh: false, structured_outputs: true, }, }, @@ -133,8 +150,12 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: true, context_1m: true, + effort: true, + // `max` is Opus-only per the migration guide; Sonnet 4.6 + // 400s on it. 
+ effort_max: false, + effort_xhigh: false, structured_outputs: true, }, }, @@ -145,8 +166,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: false, context_1m: false, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: true, }, }, @@ -157,8 +180,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: false, context_1m: true, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: true, }, }, @@ -173,8 +198,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ // target 3P throughout. interleaved_thinking: false, context_management: true, - effort: false, context_1m: false, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: true, }, }, @@ -185,8 +212,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: false, context_1m: false, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: true, }, }, @@ -201,8 +230,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: false, context_1m: false, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: false, }, }, @@ -216,8 +247,10 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: true, context_management: true, - effort: false, context_1m: true, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: false, }, }, @@ -228,13 +261,62 @@ pub(crate) const MODELS: &[ModelInfo] = &[ capabilities: Capabilities { interleaved_thinking: false, context_management: true, - effort: false, context_1m: false, + effort: false, + effort_max: false, + effort_xhigh: false, structured_outputs: 
false, }, }, ]; +impl Capabilities { + /// Whether the model accepts `output_config.effort = `. + /// Centralises the `low`/`medium`/`high` → [`Self::effort`], + /// `xhigh` → [`Self::effort_xhigh`], `max` → [`Self::effort_max`] + /// mapping so callers don't re-derive it. + fn accepts_effort(self, level: Effort) -> bool { + match level { + Effort::Low | Effort::Medium | Effort::High => self.effort, + Effort::Xhigh => self.effort_xhigh, + Effort::Max => self.effort_max, + } + } + + /// Highest level this model accepts ≤ `pick`. `None` when the + /// model doesn't accept `output_config.effort` at all. Used by + /// [`crate::config::Config::load`] to clamp an out-of-range user + /// pick down to the nearest supported level rather than 400ing + /// the gateway. + pub(crate) fn clamp_effort(self, pick: Effort) -> Option { + if !self.effort { + return None; + } + [ + Effort::Max, + Effort::Xhigh, + Effort::High, + Effort::Medium, + Effort::Low, + ] + .into_iter() + .find(|&level| level <= pick && self.accepts_effort(level)) + } + + /// Per-model default when the user hasn't specified one: `Xhigh` + /// on 4.7 (matches claude-code 2.1.119), `High` on other + /// effort-capable models, `None` otherwise. + pub(crate) fn default_effort(self) -> Option { + if self.effort_xhigh { + Some(Effort::Xhigh) + } else if self.effort { + Some(Effort::High) + } else { + None + } + } +} + /// First-match substring lookup against [`MODELS`]. Returns `None` for /// model strings that don't contain any known family stem (e.g. a future /// `claude-opus-5` before the table is bumped); callers decide whether @@ -243,6 +325,16 @@ pub(crate) fn lookup(model: &str) -> Option<&'static ModelInfo> { MODELS.iter().find(|info| model.contains(info.id_substr)) } +/// Capabilities for `model`, falling back to the all-false +/// [`Capabilities::default`] when the id doesn't match any known row. 
+/// Single entry point for the "unknown model → conservative defaults" +/// invariant so every call site decays the same way. +pub(crate) fn capabilities_for(model: &str) -> Capabilities { + lookup(model) + .map(|info| info.capabilities) + .unwrap_or_default() +} + #[cfg(test)] mod tests { use super::*; @@ -284,8 +376,9 @@ mod tests { // the `MODELS` table will fail here instead of silently // 400-ing one model family on a release day. // - // Structured outputs are an explicit allowlist rather than a - // substring rule, so they're covered by a separate test below. + // Allowlist-shaped flags (`effort_max`, `effort_xhigh`, + // `structured_outputs`) don't reduce to a substring rule, so + // they're covered by per-flag enumeration tests below. // // Opus 4.7 postdates the predicate set we mirror, so we skip // it here — there is no substring rule to check against. @@ -295,13 +388,13 @@ mod tests { } let m = info.id_substr; let is_opus_or_sonnet_4 = m.contains("opus-4") || m.contains("sonnet-4"); - let expect_thinking = is_opus_or_sonnet_4; // haiku-4 is not in modelSupportsISP + let expect_interleaved_thinking = is_opus_or_sonnet_4; // haiku-4 is not in modelSupportsISP let expect_context_management = is_opus_or_sonnet_4 || m.contains("haiku-4"); + let expect_context_1m = m.contains("claude-sonnet-4") || m.contains("opus-4-6"); let expect_effort = m.contains("opus-4-6") || m.contains("sonnet-4-6"); - let expect_one_million = m.contains("claude-sonnet-4") || m.contains("opus-4-6"); assert_eq!( - info.capabilities.interleaved_thinking, expect_thinking, + info.capabilities.interleaved_thinking, expect_interleaved_thinking, "{m}: interleaved_thinking should match modelSupportsISP", ); assert_eq!( @@ -309,29 +402,105 @@ mod tests { "{m}: context_management should match modelSupportsContextManagement", ); assert_eq!( - info.capabilities.effort, expect_effort, - "{m}: effort should match modelSupportsEffort", + info.capabilities.context_1m, expect_context_1m, + "{m}: 
context_1m should match modelSupports1M", ); assert_eq!( - info.capabilities.context_1m, expect_one_million, - "{m}: context_1m should match modelSupports1M", + info.capabilities.effort, expect_effort, + "{m}: effort should match modelSupportsEffort", ); } } #[test] - fn opus_4_7_is_treated_as_4_6_equivalent() { + fn opus_4_7_uniquely_supports_xhigh() { // Upstream predates 4.7 so its predicates wouldn't claim // `effort` or `1M` on this id_substr. We override to the // monotonic-bump projection. Pin it so a well-meaning future // edit that "aligns 4.7 with the predicates" doesn't - // accidentally strip the caps we rely on. + // accidentally strip the caps we rely on. `effort_xhigh` is + // the one 4.7-only addition — every other row must reject it. let caps = lookup("claude-opus-4-7").unwrap().capabilities; assert!(caps.interleaved_thinking); assert!(caps.context_management); - assert!(caps.effort); assert!(caps.context_1m); + assert!(caps.effort); + assert!(caps.effort_max); + assert!(caps.effort_xhigh); assert!(caps.structured_outputs); + + for other in [ + "claude-opus-4-6", + "claude-sonnet-4-6", + "claude-opus-4-5", + "claude-sonnet-4-5", + "claude-haiku-4-5", + "claude-opus-4-1", + ] { + assert!( + !lookup(other).unwrap().capabilities.effort_xhigh, + "{other} must not claim effort_xhigh — it 400s on non-4.7", + ); + } + } + + #[test] + fn effort_max_is_opus_only() { + // `max` effort is Opus-only per the migration guide. Sonnet + // 4.6 supports base `effort` but 400s on `max`; Haiku doesn't + // support `effort` at all. 
+ for supported in ["claude-opus-4-7", "claude-opus-4-6"] { + assert!( + lookup(supported).unwrap().capabilities.effort_max, + "{supported} should claim effort_max", + ); + } + for unsupported in [ + "claude-sonnet-4-6", + "claude-opus-4-5", + "claude-sonnet-4-5", + "claude-haiku-4-5", + "claude-opus-4-1", + "claude-opus-4", + "claude-sonnet-4", + "claude-haiku-4", + ] { + assert!( + !lookup(unsupported).unwrap().capabilities.effort_max, + "{unsupported} must not claim effort_max", + ); + } + } + + // ── Capabilities::clamp_effort ── + + #[test] + fn clamp_effort_picks_highest_supported_at_or_below_user_pick() { + let opus_4_7 = lookup("claude-opus-4-7").unwrap().capabilities; + assert_eq!(opus_4_7.clamp_effort(Effort::Max), Some(Effort::Max)); + assert_eq!(opus_4_7.clamp_effort(Effort::Xhigh), Some(Effort::Xhigh)); + assert_eq!(opus_4_7.clamp_effort(Effort::Low), Some(Effort::Low)); + + // Opus 4.6: Max ✓, Xhigh ✗. `xhigh` clamps down to `high` + // (never sideways-up to `max`). + let opus_4_6 = lookup("claude-opus-4-6").unwrap().capabilities; + assert_eq!(opus_4_6.clamp_effort(Effort::Max), Some(Effort::Max)); + assert_eq!(opus_4_6.clamp_effort(Effort::Xhigh), Some(Effort::High)); + assert_eq!(opus_4_6.clamp_effort(Effort::High), Some(Effort::High)); + + // Sonnet 4.6: Max ✗, Xhigh ✗. Both clamp to `high`. + let sonnet_4_6 = lookup("claude-sonnet-4-6").unwrap().capabilities; + assert_eq!(sonnet_4_6.clamp_effort(Effort::Max), Some(Effort::High)); + assert_eq!(sonnet_4_6.clamp_effort(Effort::Xhigh), Some(Effort::High)); + assert_eq!( + sonnet_4_6.clamp_effort(Effort::Medium), + Some(Effort::Medium) + ); + + // No `effort` at all → None regardless of pick. 
+ let haiku_4_5 = lookup("claude-haiku-4-5").unwrap().capabilities; + assert_eq!(haiku_4_5.clamp_effort(Effort::Max), None); + assert_eq!(haiku_4_5.clamp_effort(Effort::Low), None); } #[test] diff --git a/docs/guide/configuration.md b/docs/guide/configuration.md index a134790..86e83cb 100644 --- a/docs/guide/configuration.md +++ b/docs/guide/configuration.md @@ -18,8 +18,10 @@ All fields are optional. Only specify the values you want to override. [client] model = "claude-sonnet-4-6" base_url = "https://api.anthropic.com" -max_tokens = 8192 -# api_key = "sk-ant-..." # prefer the environment variable for secrets +effort = "high" +max_tokens = 32000 +prompt_cache_ttl = "1h" +# api_key = "sk-ant-..." # see Authentication below — env var is safer [tui] show_thinking = true @@ -27,12 +29,43 @@ show_thinking = true ### `[client]` — API connection -| Key | Type | Default | Description | -| ------------ | ------- | --------------------------- | ----------------------- | -| `api_key` | string | — | Anthropic API key | -| `model` | string | `claude-opus-4-7` | Model to use | -| `base_url` | string | `https://api.anthropic.com` | API base URL | -| `max_tokens` | integer | `16384` | Max tokens per response | +| Key | Type | Default | Description | +| ------------------ | ------- | --------------------------- | ----------------------------------- | +| `api_key` | string | — | Anthropic API key | +| `base_url` | string | `https://api.anthropic.com` | API base URL | +| `model` | string | `claude-opus-4-7` | Model to use | +| `effort` | string | per-model (see below) | Intelligence-vs-latency tier | +| `max_tokens` | integer | effort-derived (see below) | Max tokens per response | +| `prompt_cache_ttl` | string | `"1h"` | Prompt-cache TTL (`"5m"` or `"1h"`) | + +#### `effort` — intelligence tier + +`effort` maps 1:1 to the `output_config.effort` body field. Accepted values: `"low"`, `"medium"`, `"high"`, `"xhigh"`, `"max"`. 
Values above a model's per-model ceiling are silently clamped down to the highest supported level (so `"xhigh"` on Sonnet 4.6 becomes `"high"`). Models that don't accept the parameter at all (Sonnet 4.5 and older, Haiku, Opus 4.5 and older) drop it entirely from the request. + +Per-model defaults when `effort` is unset: + +| Model | Default | +| --------------- | ------- | +| Opus 4.7 | `xhigh` | +| Opus 4.6 | `high` | +| Sonnet 4.6 | `high` | +| Everything else | (unset) | + +Tier guide (from the [Opus 4.7 migration guide](https://platform.claude.com/docs/en/about-claude/models/migration-guide)): + +- `max` — deepest reasoning, Opus-only; diminishing returns on some tasks. +- `xhigh` — recommended default for coding and agentic work on Opus 4.7. +- `high` — balanced; minimum recommended for intelligence-sensitive tasks. +- `medium` — cost-sensitive workloads. +- `low` — scoped, latency-sensitive tasks. + +#### `max_tokens` — response ceiling + +When unset, oxide-code derives `max_tokens` from the resolved `effort` to match the claude-code reference: 64 000 for `xhigh` / `max`, 32 000 for `high`, 16 384 otherwise. Setting `max_tokens` explicitly (via TOML or `ANTHROPIC_MAX_TOKENS`) overrides the derivation. + +#### `prompt_cache_ttl` — cache duration + +Accepted values: `"5m"` (matches the server default as of 2026-03-06) and `"1h"` (higher write premium, bigger hit-rate win on long sessions). oxide-code defaults to `"1h"` because Anthropic's silent 2026-03 TTL drop cut typical prompt-caching savings from 80 %+ to 40-55 %. See [Agentic Request Body Fields](../research/anthropic-api.md#agentic-request-body-fields) for the wire shape and cost analysis. 
#### 1M context window — `[1m]` tag @@ -51,6 +84,8 @@ model = "claude-opus-4-7[1m]" | --------------- | ------- | ------- | ---------------------- | | `show_thinking` | boolean | `false` | Show extended thinking | +On Opus 4.7, `show_thinking = true` additionally opts the request into `thinking.display = "summarized"` so the API streams reasoning text; otherwise the 4.7 default (`"omitted"`) applies and the UI sees nothing until the final answer starts. + ## Authentication oxide-code checks three credential sources in order: @@ -58,21 +93,25 @@ oxide-code checks three credential sources in order: 1. `ANTHROPIC_API_KEY` environment variable. 2. `api_key` under `[client]` in a config file. 3. Claude Code OAuth credentials, if [Claude Code](https://code.claude.com/docs) is installed and signed in: - - **macOS** — the `"Claude Code-credentials"` Keychain entry (preferred), falling back to `~/.claude/.credentials.json`. - - **Linux** — `~/.claude/.credentials.json`. + - **macOS** — the `"Claude Code-credentials"` Keychain entry (preferred), falling back to `~/.claude/.credentials.json`. + - **Linux** — `~/.claude/.credentials.json`. + + Expired tokens are refreshed automatically. No configuration needed. - Expired tokens are refreshed automatically. No configuration needed. +Prefer the environment variable (or OAuth) over `api_key` in a config file. `ox.toml` is resolved by walking up from the current directory, so a project-local `ox.toml` is easy to commit by accident; a user-level `~/.config/ox/config.toml` is safer but still plaintext on disk. This matches what Claude Code itself does — Anthropic's CLI reads `ANTHROPIC_API_KEY` and otherwise keeps OAuth tokens in the macOS Keychain where it can. ## Environment variables Environment variables override all config file values. 
-| Variable | Config key | Default | Description | -| ---------------------- | ------------------- | --------------------------- | ----------------------- | -| `ANTHROPIC_API_KEY` | `client.api_key` | — | Anthropic API key | -| `ANTHROPIC_MODEL` | `client.model` | `claude-opus-4-7` | Model to use | -| `ANTHROPIC_BASE_URL` | `client.base_url` | `https://api.anthropic.com` | API base URL | -| `ANTHROPIC_MAX_TOKENS` | `client.max_tokens` | `16384` | Max tokens per response | -| `OX_SHOW_THINKING` | `tui.show_thinking` | `false` | Show extended thinking | +| Variable | Config key | Default | Description | +| ---------------------- | ------------------------- | --------------------------- | ---------------------------- | +| `ANTHROPIC_API_KEY` | `client.api_key` | — | Anthropic API key | +| `ANTHROPIC_BASE_URL` | `client.base_url` | `https://api.anthropic.com` | API base URL | +| `ANTHROPIC_MODEL` | `client.model` | `claude-opus-4-7` | Model to use | +| `ANTHROPIC_EFFORT` | `client.effort` | per-model | Intelligence-vs-latency tier | +| `ANTHROPIC_MAX_TOKENS` | `client.max_tokens` | effort-derived | Max tokens per response | +| `OX_PROMPT_CACHE_TTL` | `client.prompt_cache_ttl` | `1h` | Prompt-cache TTL | +| `OX_SHOW_THINKING` | `tui.show_thinking` | `false` | Show extended thinking | Set `OX_SHOW_THINKING=1` to display the model's thinking process (dimmed text) when extended thinking is enabled for the model. diff --git a/docs/research/anthropic-api.md b/docs/research/anthropic-api.md index 5c75fdc..99ac050 100644 --- a/docs/research/anthropic-api.md +++ b/docs/research/anthropic-api.md @@ -85,7 +85,8 @@ Key rules: - **Haiku one-shots** (title generation, compaction classifier) — strip agentic markers entirely. `claude-code-20250219` is re-added only when the call is agentic. 
- **`prompt-caching-scope` requires a 1P base URL** — the beta only matters when a block carries `cache_control.scope: "global"`, which 3P gateways reject (see [Prompt Caching Scope](#prompt-caching-scope)). oxide-code gates the header on `is_first_party_base_url()` so requests going through a proxy ship neither the scope field nor its beta. - **`context-1m` is user opt-in via `[1m]`** — appending `[1m]` to the model string (e.g., `claude-opus-4-7[1m]`) adds the 1M beta and strips the tag before the request hits the wire. Family-based auto-enable would 400 on subscriptions or gateways that don't carry 1M access. Convention matches claude-code. -- **`effort` is Opus 4.6+ and Sonnet 4.6+ only** — Opus 4.5 and older, Sonnet 4.5 and older, and all Haiku variants reject it per upstream's `modelSupportsEffort`. +- **`effort` is Opus 4.6+ and Sonnet 4.6+ only** — Opus 4.5 and older, Sonnet 4.5 and older, and all Haiku variants reject it per upstream's `modelSupportsEffort`. The per-level ceiling (`xhigh` on 4.7, `max` on Opus 4.6 / 4.7) is separately encoded in `Capabilities::effort_xhigh` / `effort_max`. +- **`effort` and `context-management` betas need a body field.** Sending the header alone is a silent no-op — the request runs at the server default. See [Agentic Request Body Fields](#agentic-request-body-fields) for the matching `output_config.effort` and `context_management.edits` shapes. oxide-code pairs each capability with both its beta and its body field so the two stay in sync. - **`structured-outputs` is per-version and caller-opt-in** — the upstream allowlist is Opus 4.1 / 4.5 / 4.6+, Sonnet 4.5 / 4.6+, Haiku 4.5. The beta ships only when a caller supplies an `output_config.format` (today: the AI-title generator). The body field and header are paired on the same capability flag: a schema passed to an unsupported model silently falls back to free-form text, mirroring the `[1m]` × `context_1m` silent-strip pattern. 
 - **Unknown model aliases** fall through substring matching on the family stem. `claude-opus-5-x` would miss every row and ship with only the identity / caching betas; bump the `MODELS` table when a new family lands.
@@ -214,6 +215,59 @@ The shape is otherwise identical in both modes: same static / dynamic section sp
 
 This matches the broader pattern of gating features like fine-grained tool streaming and client-request-ID injection on base URL rather than on the provider enum alone — the provider flag says "not Bedrock / not Vertex", but a user pointing `ANTHROPIC_BASE_URL` at a proxy still parses as first-party by that check.
 
+## Agentic Request Body Fields
+
+Some capabilities live in the request body. For the beta-gated ones, the body field travels alongside (not instead of) the `anthropic-beta` header: shipping the header while omitting the body field is a silent no-op — the feature doesn't activate. (`cache_control.ttl` and `thinking.display` below are GA and need no header at all.) The field shapes below were captured live from `claude-code 2.1.119` and cross-checked against the official migration guide.
+
+### `output_config.effort`
+
+GA as of Opus 4.6. Controls the intelligence-vs-latency tier of agentic turns via one of five tokens: `low`, `medium`, `high`, `xhigh`, `max`.
+
+```json
+{
+  "output_config": { "effort": "xhigh" }
+}
+```
+
+- **The `effort-2025-11-24` beta header is necessary but not sufficient.** oxide-code used to send the header without the body field; the header became a no-op and the model ran at an undefined default.
+- **Per-model ceiling.** `max` is Opus-only; Sonnet 4.6 400s on it. `xhigh` is Opus 4.7-only. The `Capabilities::effort_max` / `effort_xhigh` flags encode this; `Capabilities::clamp_effort` clamps a user pick down to the highest supported level at or below it.
+- **Per-model default.** claude-code 2.1.119 sends `xhigh` on Opus 4.7, sends `high` on Opus 4.6 and Sonnet 4.6, and omits the field entirely on earlier models. oxide-code mirrors this via `Capabilities::default_effort`.
+- **`max_tokens` should scale with effort.** claude-code uses 64K on Opus 4.7 at `xhigh`, 32K on Sonnet 4.6 at `high`. oxide-code's `default_max_tokens(effort)` applies the same scaling when the user hasn't set `ANTHROPIC_MAX_TOKENS` explicitly.
+
+### `context_management.edits`
+
+Partners the `context-management-2025-06-27` beta header. claude-code ships the same directive on every 4.6+ request:
+
+```json
+{
+  "context_management": {
+    "edits": [{ "type": "clear_thinking_20251015", "keep": "all" }]
+  }
+}
+```
+
+oxide-code applies the body-header coupling as an invariant: the body field is populated on every request whose model has `Capabilities::context_management` set, i.e. the same condition that enables the beta header. One-shot completions (the `complete` path in `client::anthropic`) skip both — matches the reference wire and keeps the title-generation path minimal.
+
+### `cache_control.ttl`
+
+Anthropic silently dropped the default ephemeral-cache TTL from `1h` to `5m` on 2026-03-06 — a 40–55% savings regression on any session longer than 5 minutes. The opt-in is a body field, not a beta:
+
+```json
+{
+  "cache_control": { "type": "ephemeral", "ttl": "1h" }
+}
+```
+
+Accepted values: `"5m"` (server default, equivalent to omitting the field) and `"1h"` (opt-in at higher write premium). No beta header is required — the field is GA.
+
+**oxide-code default.** `prompt_cache_ttl = "1h"`. The hit-rate recovery on real agent sessions (tool-use loops, resumed conversations) dominates the write premium, so `1h` is the right safe default. Users opt down via `[client].prompt_cache_ttl = "5m"` or `OX_PROMPT_CACHE_TTL=5m`.
+
+Invalidation order (from the Anthropic caching docs) is `tools → system → messages` — any change at a level busts that level and every level after it. oxide-code attaches a single `cache_control` to the static system-prompt prefix block (scope-gated on 1P / 3P per the previous section); the TTL rides through on both paths.
+ +### `thinking.display` + +See [Extended Thinking § Display modes (Opus 4.7+)](./extended-thinking.md#display-modes-opus-47). Opus 4.7 silently flipped the default to `"omitted"`; `show_thinking=true` in oxide-code opts back into `"summarized"`. + ## Third-Party Tool Restrictions As of April 4, 2026, Anthropic enforces that OAuth subscription credits (Pro / Max) are only valid for official Claude Code and claude.ai clients. Third-party tools that reuse the OAuth flow are classified as "third-party harness traffic" and must use either: @@ -267,3 +321,5 @@ oxide-code implements the same refresh flow: proactive refresh with the 5-minute - `claude-code/src/utils/secureStorage/index.ts` — platform-specific storage dispatch - `claude-code/src/utils/secureStorage/macOsKeychainStorage.ts` — macOS Keychain backend - `claude-code/src/utils/secureStorage/plainTextStorage.ts` — credential file I/O + +Body-field research is empirical rather than source-backed: the `output_config.effort`, `context_management.edits`, and `cache_control.ttl` wire shapes documented above were captured live from a `claude-code --bare -p --model claude-opus-4-7` session against a local SSE proxy on 2026-04-24 and cross-referenced with the [Opus 4.7 migration guide](https://platform.claude.com/docs/en/about-claude/models/migration-guide) and [Anthropic prompt-caching docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching). diff --git a/docs/research/extended-thinking.md b/docs/research/extended-thinking.md index afdb593..554e16d 100644 --- a/docs/research/extended-thinking.md +++ b/docs/research/extended-thinking.md @@ -43,6 +43,27 @@ When thinking is enabled, `temperature` must be omitted from the request (API re - `interleaved-thinking-2025-05-14` — enables thinking blocks interleaved with text / tool_use. - Without this header, thinking blocks appear only at the start of the response. 
+### Display modes (Opus 4.7+)
+
+Opus 4.7 adds a `thinking.display` field with two wire values:
+
+| Value          | Meaning                                                                |
+| -------------- | ---------------------------------------------------------------------- |
+| `"summarized"` | Thinking blocks stream summarized reasoning text.                      |
+| `"omitted"`    | Thinking blocks still ship, but their `thinking` text is empty (`""`). |
+
+**Silent default change.** On Opus 4.6, the server defaulted to `"summarized"`. On Opus 4.7 the default is `"omitted"` — any UI that renders streaming reasoning (including oxide-code's `show_thinking` TUI mode) sees a long pause followed by the final answer unless it opts back in:
+
+```json
+{
+  "thinking": { "type": "adaptive", "display": "summarized" }
+}
+```
+
+Older models (4.6, 4.5) accept the field and ignore it, so sending it unconditionally is safe when the caller wants summarized output. oxide-code couples `display` to `config.show_thinking`: `Some(Summarized)` when the TUI is set up to render reasoning, `None` (field absent) otherwise. The `None` path preserves the pre-4.7 wire shape and lets 4.7's `omitted` default do what it says.
+
+No beta header gates `display` — it's GA on 4.7.
+
 ## Thinking Block Lifecycle
 
 ### Streaming
@@ -85,7 +106,7 @@ Every `thinking` block includes a `signature` field received via `signature_delt
 
 Claude Code handles credential rotation in `stripSignatureBlocks()`, which removes all thinking / redacted_thinking blocks when the active credential changes.
 
-oxide-code implements the full thinking data pipeline: typed `Thinking`, `RedactedThinking`, and `ServerToolUse` content blocks with proper streaming accumulation, signature handling, round-trip preservation, and trailing thinking stripping with placeholder insertion. Adaptive thinking is enabled by default. Credential rotation stripping is not yet implemented (depends on Keychain OAuth support).
+oxide-code implements the full thinking data pipeline: typed `Thinking`, `RedactedThinking`, and `ServerToolUse` content blocks with proper streaming accumulation, signature handling, round-trip preservation, and trailing thinking stripping with placeholder insertion. Adaptive thinking is enabled by default; `thinking.display` is set to `"summarized"` whenever the TUI's `show_thinking` flag is on (and omitted otherwise so 4.7's `"omitted"` default applies). Credential rotation stripping is not yet implemented (depends on Keychain OAuth support). ## Sources