prisma · wmadden-electric · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
@@ -38,7 +38,7 @@
     "lint:docs": "node scripts/validate-package-readmes.mjs",
     "lint:manifests": "node scripts/validate-package-manifests.mjs",
     "lint:workflows": "node scripts/lint-workflow-triggers.mjs",
-    "test:scripts": "node --test scripts/lint-workflow-triggers.test.mjs scripts/validate-skills.test.mjs scripts/determine-version-utils.test.ts scripts/check-upgrade-coverage.test.mjs scripts/set-version-utils.test.ts scripts/check-publish-deps-pn-pins.test.mjs scripts/publish-packages-utils.test.mjs scripts/check-clean-tree.test.mjs scripts/lint-casts.test.mjs scripts/sync-agent-rules.test.mjs skills-contrib/drive-diagnose-run/test/load.test.ts skills-contrib/drive-diagnose-run/test/metrics.test.ts skills-contrib/drive-diagnose-run/test/invariants.test.ts skills-contrib/drive-diagnose-run/test/cascade-brief.test.ts skills-contrib/drive-diagnose-run/test/report.test.ts skills-contrib/drive-diagnose-run/test/posthoc.test.ts skills-contrib/drive-diagnose-run/test/scorecard.test.ts skills-contrib/drive-record-traces/test/emit.test.ts skills-contrib/drive-judge-harness/test/usage.test.ts skills-contrib/drive-judge-harness/test/manifest.test.ts skills-contrib/drive-judge-harness/test/load-brief.test.ts skills-contrib/drive-judge-harness/test/run-one-brief.test.ts skills-contrib/drive-judge-harness/test/validate-parser.test.ts skills-contrib/drive-judge-harness/test/judge-model-sdk.test.ts skills-contrib/drive-judge-harness/test/rubric-correctness.test.ts skills-contrib/drive-judge-harness/test/classify-failure.test.ts skills-contrib/drive-judge-harness/test/classify-operator.test.ts skills-contrib/drive-judge-harness/test/emit-correctness.test.ts skills-contrib/drive-judge-harness/test/calibration.test.ts skills-contrib/drive-judge-harness/test/prepare-run.test.ts skills-contrib/drive-judge-harness/test/collect-run.test.ts skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts skills-contrib/drive-judge-harness/test/run-arm.test.ts",
+    "test:scripts": "node --test scripts/lint-workflow-triggers.test.mjs scripts/validate-skills.test.mjs scripts/determine-version-utils.test.ts scripts/check-upgrade-coverage.test.mjs scripts/set-version-utils.test.ts scripts/check-publish-deps-pn-pins.test.mjs scripts/publish-packages-utils.test.mjs scripts/check-clean-tree.test.mjs scripts/lint-casts.test.mjs scripts/sync-agent-rules.test.mjs skills-contrib/drive-diagnose-run/test/load.test.ts skills-contrib/drive-diagnose-run/test/metrics.test.ts skills-contrib/drive-diagnose-run/test/invariants.test.ts skills-contrib/drive-diagnose-run/test/cascade-brief.test.ts skills-contrib/drive-diagnose-run/test/report.test.ts skills-contrib/drive-diagnose-run/test/posthoc.test.ts skills-contrib/drive-diagnose-run/test/scorecard.test.ts skills-contrib/drive-record-traces/test/emit.test.ts skills-contrib/drive-judge-harness/test/usage.test.ts skills-contrib/drive-judge-harness/test/manifest.test.ts skills-contrib/drive-judge-harness/test/load-brief.test.ts skills-contrib/drive-judge-harness/test/run-one-brief.test.ts skills-contrib/drive-judge-harness/test/sdk-events.test.ts skills-contrib/drive-judge-harness/test/claude-events.test.ts skills-contrib/drive-judge-harness/test/validate-parser.test.ts skills-contrib/drive-judge-harness/test/judge-model-sdk.test.ts skills-contrib/drive-judge-harness/test/rubric-correctness.test.ts skills-contrib/drive-judge-harness/test/classify-failure.test.ts skills-contrib/drive-judge-harness/test/classify-operator.test.ts skills-contrib/drive-judge-harness/test/emit-correctness.test.ts skills-contrib/drive-judge-harness/test/calibration.test.ts skills-contrib/drive-judge-harness/test/prepare-run.test.ts skills-contrib/drive-judge-harness/test/collect-run.test.ts skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts skills-contrib/drive-judge-harness/test/run-arm.test.ts",
     "drive:diagnose": "node skills-contrib/drive-diagnose-run/cli.ts",
     "drive:emit": "node skills-contrib/drive-record-traces/emit.ts",
     "drive:run-brief": "node skills-contrib/drive-judge-harness/run-one-brief.ts",
@@ -59,6 +59,7 @@
     "prepare": "husky && skills add ./skills-contrib --skill '*' --agent universal claude-code -y && node scripts/sync-agent-rules.mjs"
   },
   "devDependencies": {
+    "@anthropic-ai/claude-agent-sdk": "^0.3.158",
     "@biomejs/biome": "2.4.15",
     "@cursor/sdk": "^1.0.15",
     "@prisma-next/tsconfig": "workspace:0.11.0",

@@ -0,0 +1,29 @@
+# Plan: claude-runtime (TML-2759)
+
+Test-first. The Claude SDK is reached only via `claude-adapter.ts`'s lazy import (mirroring `sdk-adapter.ts`); all mapping logic lives in the no-SDK `claude-events.ts` so it's unit-testable with the SDK absent. Built on branch `tml-2757-run-fidelity` (PR #657), on top of the run-fidelity commits.
+
+## Dispatches
+
+### D1 — `claude-events.ts`: pure mappers + extraction (test-first)
+- **Outcome:** Claude message/result shapes map to the harness's `RunStreamEvent` + a rich outcome, with no SDK import.
+- Implement `usageFromAssistant`, `streamEventFromMessage`, `outcomeFromResult` (→ `{status,runId,tokens,durationMs,costUsd,numTurns}`) over `unknown`. Map `cache_creation_input_tokens`→`cacheWriteTokens`, `cache_read_input_tokens`→`cacheReadTokens`; `session_id`→`runId`; `subtype==='success'`→`finished`.
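+- Tests (`test/claude-events.test.ts`): feed a real `SDKResultMessage` (success + an `error_*` subtype) and a real `assistant` message through the mappers; assert token totals, `costUsd` (manifest `cost_usd`), `durationMs` (from `duration_ms`; manifest `wall_clock_ms`), `numTurns` (manifest `num_turns`), and `runId` (manifest `run_id`); assert the mappers degrade on non-records. Runs with the SDK not installed.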
+- **Builds on:** run-fidelity (`usage.ts`, the seam). **Hands to:** D2.
+
+### D2 — `claude-adapter.ts` + seam/manifest + runtime selection (test-first)
+- **Outcome:** the harness runs on Claude by default and records tokens/cost/turns; `--runtime cursor` still works.
+- `RunOutcome` gains `tokens`/`costUsd`/`numTurns` (Cursor adapter sets null). `run-one-brief.ts`: prefer `outcome.tokens` else `accumulateUsage`; populate `cost_usd`/`num_turns`/`wall_clock_ms`; runtime selection + per-runtime key gating; `defaultCreateAgent(runtime)`. `manifest.ts`: add `runtime`/`cost_usd`/`num_turns`. `run-arm.ts` + `run-one-brief.ts` CLIs: `--runtime` (default claude), `--max-budget-usd`.
+- `claude-adapter.ts`: `query()` with `cwd`/`settingSources:['project']`/`skills:'all'`/`permissionMode:'bypassPermissions'`/`allowDangerouslySkipPermissions:true`/`model`/`maxBudgetUsd`; buffer the result for `wait()`.
+- Tests: injected `createAgent` returning a Claude-shaped outcome → manifest has `runtime:'claude'`, non-null `tokens`/`cost_usd`/`num_turns`; a `--runtime cursor` selection test; key-gating per runtime.
+- **Builds on:** D1. **Hands to:** D3 (orchestrator).
+
+### D3 — install + docs + live smoke + gates + PR (orchestrator)
+- Install `@anthropic-ai/claude-agent-sdk` (`pnpm add -w -D`); handle any build-script/native hiccups as with `@cursor/sdk`.
+- Wire `test/claude-events.test.ts` into `test:scripts`.
+- Docs: SKILL.md "Runtimes" section (claude default / cursor secondary, selection, `maxBudgetUsd`); scope the token-gap note to the Cursor adapter in SKILL.md + KNOWN-ISSUES.
+- Live smoke on `claude-haiku-4-5`, run only if `ANTHROPIC_API_KEY` is present (otherwise leave a gated follow-up note).
+- Gates: `pnpm test:scripts`, biome, transient-id scan. Update PR #657 title/body to "faithful + decoupled runs" (refs TML-2757 + TML-2759). Commit signed-off, push.
+- **Builds on:** D2.
+
+## Sequencing
+Serial: D1 (mappers) → D2 (adapter + wiring consume them) → D3 (install/docs/gates). Target 3 dispatches; D1+D2 delegated to one implementer, D3 by the orchestrator.
@@ -0,0 +1,88 @@
+# Slice: claude-runtime
+
+_Parent project `projects/drive-judge-harness/`. Outcome this slice contributes: the harness is **decoupled from Cursor** — it runs the Drive orchestrator on Anthropic's Claude Agent SDK by default, which reports real token usage, USD cost, and wall-clock natively (the signal `@cursor/sdk`'s local runtime never gave us). The Cursor adapter stays as a runtime-selectable secondary. Delivered alongside the run-fidelity fixes on the same branch/PR (#657)._
+
+## At a glance
+
+A live run now records tokens + dollars + wall-clock, because the runtime reports them:
+
+```jsonc
+{ "runtime": "claude", "model": "claude-haiku-4-5", "status": "finished",
+  "run_id": "<session-id>", "agent_id": null,
+  "tokens": { "inputTokens": 33, "outputTokens": 904, "cacheReadTokens": 230827, "cacheWriteTokens": 53995, "totalTokens": 285759 },
+  "cost_usd": 0.1839242, "num_turns": 9, "wall_clock_ms": 16025, "notes": [] }
+```
+
+The Cursor runtime stays available via `--runtime cursor`; its token gap (documented in the run-fidelity work) is now scoped to that adapter.
+
+## Chosen design
+
+The Cursor coupling lives in exactly one module behind a seam that already exists: `run-one-brief.ts` defines `CreateAgent` / `OrchestratorRun` / `RunOutcome`; `sdk-adapter.ts` is the only `@cursor/sdk` importer. This slice adds a **second adapter** over the same seam.
+
+Ground-truth Claude Agent SDK shapes (`@anthropic-ai/claude-agent-sdk`, confirmed from the cost-tracking + TS-reference docs):
+- `query({ prompt, options })` returns an async iterable of messages.
+- Per-`assistant` message: nested `message.usage` (`input_tokens`, `output_tokens`, `cache_creation_input_tokens`, `cache_read_input_tokens`) + `message.id`.
+- Terminal `result` message (`SDKResultMessage`): `subtype` (`success` | `error_*`), cumulative `usage` (same fields), `total_cost_usd`, `duration_ms`, `num_turns`, `session_id`, `result`.
+
+### 1. `claude-events.ts` — pure mappers (no SDK import)
+
+Mirror of `sdk-events.ts`, for the Claude shapes. Operates over `unknown`; imports nothing from the SDK so it's unit-testable with the SDK absent. Exports:
+- `usageFromAssistant(msg) -> TurnUsage | null` — maps `message.usage` (`cache_creation_input_tokens`→`cacheWriteTokens`, `cache_read_input_tokens`→`cacheReadTokens`).
+- `streamEventFromMessage(msg) -> RunStreamEvent` — `assistant` with usage → `turn-ended`; else `other`.
+- `outcomeFromResult(msg) -> { status; runId; tokens; durationMs; costUsd; numTurns } | null` — only for `type: 'result'`. `subtype === 'success'` → `finished`, else `error`; `session_id` → `runId`; cumulative `usage` → `TokenTotals`; `total_cost_usd` → `costUsd`; `duration_ms` → `durationMs`; `num_turns` → `numTurns`. Degrades on non-records.
+
+### 2. `claude-adapter.ts` — the only Claude-SDK importer (lazy)
+
+Implements `CreateAgent` over `query()`. Because `query()` is one generator (not split stream/wait), the adapter iterates it inside `stream()`, yields `turn-ended` events from per-assistant usage, captures the terminal `result` message, and returns it from `wait()` (run-one-brief drains the stream before calling `wait()`, so the result is available). `query()` options for an **unattended, skill-aware** orchestrator run:
+- `cwd: runDir` (the prepared checkout — its `.claude/skills/` are the injected bundle)
+- `settingSources: ['project']` (loads `.claude/skills/`, `.claude/agents/`, `CLAUDE.md` from the checkout)
+- `skills: 'all'` (auto-enables the `Skill` tool)
+- `permissionMode: 'bypassPermissions'` + `allowDangerouslySkipPermissions: true` (no interactive prompts)
+- `model` (the pinned model id)
+- `maxBudgetUsd` when provided (hard per-run dollar cap — aborts with `error_max_budget_usd`)
+
+### 3. Seam + manifest extensions
+
+- `RunOutcome` gains `tokens: TokenTotals | null`, `costUsd: number | null`, `numTurns: number | null` (Cursor adapter sets these `null`; tokens still flow via per-turn accumulation there).
+- `run-one-brief.ts`: prefer `outcome.tokens` when present, else fall back to `accumulateUsage(usageUpdates)`. Populate `cost_usd` / `num_turns` / `wall_clock_ms` from the outcome. The null-token note (from the run-fidelity work) fires only when tokens are genuinely null.
+- `RunManifest` gains `runtime: 'claude' | 'cursor'`, `cost_usd: number | null`, `num_turns: number | null`.
+- **Runtime selection:** `RunOneBriefConfig`/`RunArmConfig` gain `runtime: 'claude' | 'cursor'` (default `'claude'`) and optional `maxBudgetUsd`. `defaultCreateAgent(runtime)` lazily imports the matching adapter. The gate's `apiKeyPresent` is computed against the runtime's key (`ANTHROPIC_API_KEY` for claude, `CURSOR_API_KEY` for cursor). CLI gains `--runtime <claude|cursor>` (default claude) and `--max-budget-usd <n>`.
+
+## Coherence rationale
+
+One reviewer can hold it in one sitting: a second adapter behind an existing seam, plus the manifest fields the new runtime can finally populate. It's entangled with the run-fidelity work on the same branch — both are "make the recorded run faithful," and this slice is what turns the token gap that work documented into a captured signal. It rolls forward as a new pure module, a new adapter, additive outcome/manifest fields, and a runtime selector. No production package is touched.
+
+## Scope
+
+**In:** `claude-events.ts` (+ tests with real result/assistant fixtures); `claude-adapter.ts` (lazy, sole Claude-SDK importer); `RunOutcome`/`RunManifest` additions; runtime selection + key-gating + CLI flags in `run-one-brief.ts` and `run-arm.ts`; install `@anthropic-ai/claude-agent-sdk`; SKILL.md runtimes section + scope the token-gap doc to the Cursor adapter; new test wired into `test:scripts`. Delivered on branch `tml-2757-run-fidelity` / PR #657.
+
+**Out:** removing the Cursor adapter (kept as secondary, operator decision). The A/B loop / aggregation / CI gate (TML-2737). Judge calibration (TML-2736) and corpus generation (real-dollar, operator-gated).
+
+## Pre-investigated edge cases
+
+| Edge case | Disposition | Notes |
+|---|---|---|
+| `query()` is one generator, not stream+wait | Drove the adapter shape | Iterate in `stream()`, stash the `result` message for `wait()`. |
+| Claude reports cumulative usage on `result`, not just per-turn | `RunOutcome.tokens` | run-one-brief prefers outcome tokens; per-turn accumulation stays the Cursor path. |
+| No `agent_id` concept in Claude SDK | `agent_id: null`, `session_id`→`run_id` | The session id is the run identifier. |
+| Unattended run hitting a permission prompt | `bypassPermissions` + `allowDangerouslySkipPermissions` | Required for autonomous orchestrator runs. |
+| Runaway cost during calibration | `maxBudgetUsd` cap | Aborts with `error_max_budget_usd`; recorded as an error run with usage-so-far. |
+| `@anthropic-ai/claude-agent-sdk` not installed at test time | Lazy import behind the gate | `claude-events.ts` has no SDK import; tests never load the adapter. |
+
+## Slice-specific done conditions
+
+- [ ] A test feeds a real `SDKResultMessage` (success + an `error_*` subtype) through `claude-events.ts` and asserts `tokens`, `cost_usd`, `wall_clock_ms`, `num_turns`, `run_id` extraction — with the SDK not installed.
+- [ ] `--runtime cursor` still produces a Cursor-runtime manifest (selection works both ways).
+- [ ] When `ANTHROPIC_API_KEY` is present, a live smoke run on `claude-haiku-4-5` records non-null `tokens` + `cost_usd`; when it is absent, the live smoke is a gated follow-up.
+
+## Open Questions
+
+1. **Subagent token attribution.** Claude's `usage` aggregates orchestrator + subagents into one run total (per-subagent breakdown is an open SDK request). Working position: the run total is exactly what we want for the efficiency metric; per-subagent attribution is not needed for this slice.
+
+## References
+
+- Parent: `projects/drive-judge-harness/spec.md`; sibling run-fidelity slice (same branch).
+- Spike: `projects/drive-judge-harness/spikes/2026-05-31-sdk-token-usage-retrieval.md`.
+- Linear: [TML-2759](https://linear.app/prisma-company/issue/TML-2759) (related TML-2757, blocks TML-2737).
+- SDK docs: [cost-tracking](https://code.claude.com/docs/en/agent-sdk/cost-tracking), [TS reference](https://code.claude.com/docs/en/agent-sdk/typescript), [skills](https://code.claude.com/docs/en/agent-sdk/skills).
+- Seam: `skills-contrib/drive-judge-harness/{run-one-brief,sdk-adapter,sdk-events,run-arm,manifest,usage}.ts`.
@@ -0,0 +1,37 @@
+# Plan: run-fidelity (TML-2757)
+
+Test-first throughout. The live SDK is reached only via `sdk-adapter.ts`'s dynamic import; all new logic lives in no-SDK-import modules so it's unit-testable with `@cursor/sdk` absent. Spike `2026-05-31-sdk-token-usage-retrieval.md` is committed in dispatch 1.
+
+## Dispatches
+
+### D1 — `sdk-events.ts`: pure mappers + real-shape extraction (test-first)
+- **Outcome:** message/outcome mapping lives in a no-SDK module, with `agent_id` and `durationMs` extracted from the **real captured shapes**.
+- Move `extractText` / `toStreamEvent` / `adaptOutcome` (and the now-dead `extractUsage`) out of `sdk-adapter.ts` into new `sdk-events.ts` (imports nothing from the SDK; operates over `unknown`). Add `agentIdFromMessage`, `outcomeFromResult` (→ `{status,runId,durationMs}`), `streamEventFromMessage`.
+- Tests (`test/sdk-events.test.ts`): feed the real `status`/`assistant`/outcome fixtures from the spike; assert `agent_id`, `durationMs`, stream mapping. Runs with the SDK uninstalled.
+- `sdk-adapter.ts` imports the mappers (no behaviour change).
+- Commit the spike artifact here.
+- **Builds on:** merged run-setup. **Hands to:** D2.
+
+### D2 — capture agent_id + wall-clock end-to-end (test-first)
+- **Outcome:** a finished run records the real `agent_id` and `wall_clock_ms`.
+- `run-one-brief.ts`: `RunOutcome` gains `durationMs: number | null`; adapter captures `agent_id` from the first stream message carrying one and returns it from `wait()`.
+- `manifest.ts`: add `wall_clock_ms`; add the token-unavailable note when `tokens` is null on a finished live run. `run-arm.ts` threads `wall_clock_ms` into the enriched manifest.
+- Tests: outcome→manifest mapping populates `agent_id` + `wall_clock_ms`; null-token note present.
+- **Builds on:** D1. **Hands to:** D3.
+
+### D3 — `collect-run` run-scoping (test-first)
+- **Outcome:** `collectRun` returns only traces emitted during the run.
+- `prepare-run.ts`: snapshot `*.jsonl` under `runDir` after the baseline commit → `PreparedRun.preexistingTracePaths`.
+- `collect-run.ts`: exclude `preexistingTracePaths`; `agent_id` match over the remainder.
+- Tests: baseline-committed trace + run-emitted trace → only the latter returned (cover a gitignored-path trace).
+- **Builds on:** D2. **Hands to:** D4.
+
+### D4 — docs + gates + PR
+- **Outcome:** token gap documented; suite green; PR open.
+- SKILL.md / KNOWN-ISSUES: token gap (link spike) + wall-clock-as-primary note.
+- Wire new tests into `test:scripts`; run `pnpm -w typecheck`, `pnpm -w lint`, `pnpm -w test:scripts`; fix fallout.
+- Stage explicitly, sign off, push to `tml-2757-run-fidelity`, open PR (create-pr skill).
+- **Builds on:** D3.
+
+## Sequencing
+Serial: D1 unlocks testability; D2 consumes the extractors; D3 is logically independent of D2 but shares the manifest touch, so it is sequenced after D2 to avoid conflicts (hence its "Builds on: D2"); D4 closes. Target 4 dispatches.