3 changes: 2 additions & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/agent", "src/llm", "src/observability"]
packages = ["src/agent", "src/evaluation", "src/llm", "src/observability"]

[project]
name = "assetopsbench-mcp"
@@ -42,6 +42,7 @@ wo-mcp-server = "servers.wo.main:main"
vibration-mcp-server = "servers.vibration.main:main"
openai-agent = "agent.openai_agent.cli:main"
deep-agent = "agent.deep_agent.cli:main"
evaluate = "evaluation.cli:main"


[dependency-groups]
34 changes: 34 additions & 0 deletions src/evaluation/__init__.py
@@ -0,0 +1,34 @@
"""Offline evaluation harness for AssetOpsBench agent runs.

Consumes saved trajectory files (written by
:func:`observability.persistence.persist_trajectory`) and scenario files
(under ``src/scenarios/``) and emits a structured JSON report combining
graded outcomes with operational metrics.

The shape mirrors conventions from SWE-bench, HELM, and τ-bench:
``run`` (executes the agent — already exists) → ``evaluate`` (this
module) → ``report.json``. Re-grading from saved trajectories is
first-class.
"""

from .models import (
AggregateOps,
EvalReport,
GradeResult,
OpsMetrics,
PersistedTrajectory,
Scenario,
ScenarioResult,
TypeBreakdown,
)

__all__ = [
"AggregateOps",
"EvalReport",
"GradeResult",
"OpsMetrics",
"PersistedTrajectory",
"Scenario",
"ScenarioResult",
"TypeBreakdown",
]
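
The concrete scenario schema lives in evaluation.models, which is not part of this diff; as a rough sketch of what one scenario entry looks like from the graders' point of view (field names inferred only from how exact_string_match and numeric_match read them, with the id and question fields purely hypothetical):

# Rough sketch of one scenario entry, inferred from the grader code only:
# grading_method routes to the registry, expected_answer is the ground truth,
# and extra keys such as "tolerance" reach numeric_match via model_extra.
# "id" and "question" are hypothetical placeholder fields.
example_scenario = {
    "id": "scn-001",
    "question": "How many open work orders does Chiller 6 have?",
    "grading_method": "numeric_match",
    "expected_answer": "3",
    "tolerance": 0.0,
}
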
106 changes: 106 additions & 0 deletions src/evaluation/cli.py
@@ -0,0 +1,106 @@
"""``uv run evaluate`` — offline grading + report generation."""

from __future__ import annotations

import argparse
import logging
import sys
from pathlib import Path

from . import graders as grader_registry
from .report import render_summary, write_report
from .runner import evaluate


def _build_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(
prog="evaluate",
description=(
"Grade saved agent trajectories against scenario files and "
"emit a JSON report."
),
)
p.add_argument(
"--trajectories",
type=Path,
required=True,
help="Directory of {run_id}.json trajectory files (or a single file).",
)
p.add_argument(
"--scenarios",
type=Path,
nargs="+",
required=True,
help="One or more scenario JSON / JSONL files.",
)
p.add_argument(
"--output",
type=Path,
required=True,
help="Path to write the JSON report.",
)
p.add_argument(
"--grader-default",
default="llm_judge",
help="Grader name when scenario.grading_method is unset. "
"Default: llm_judge.",
)
p.add_argument(
"--judge-model",
default=None,
help="Model id for the LLM judge (e.g. "
"litellm_proxy/anthropic/claude-opus-4-5). "
"Required when any scenario routes to llm_judge.",
)
p.add_argument(
"-v",
"--verbose",
action="store_true",
help="Enable INFO-level logging.",
)
return p


def _maybe_install_judge(judge_model: str | None) -> None:
if not judge_model:
return
    # Imported lazily so the CLI still works for deterministic-only runs
    # even when the LiteLLM dependency is unavailable in the environment.
from llm import LiteLLMBackend # type: ignore[import-not-found]

from .graders.llm_judge import install

install(LiteLLMBackend(model=judge_model))


def _validate_grader_default(name: str) -> None:
    try:
        grader_registry.get(name)
    except KeyError as exc:
        # KeyError wraps its message in quotes when stringified; unwrap it
        # so the CLI error reads cleanly.
        raise SystemExit(exc.args[0]) from exc


def main(argv: list[str] | None = None) -> int:
args = _build_parser().parse_args(argv)
logging.basicConfig(
level=logging.INFO if args.verbose else logging.WARNING,
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

_maybe_install_judge(args.judge_model)
_validate_grader_default(args.grader_default)

report = evaluate(
trajectories_path=args.trajectories,
scenarios_paths=list(args.scenarios),
default_grading_method=args.grader_default,
)

out = write_report(report, args.output)
print(render_summary(report))
print(f"\nReport written: {out}")
return 0


if __name__ == "__main__":
sys.exit(main())
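
For reference, a minimal sketch of driving the new command programmatically; the paths below are hypothetical, and --grader-default is pointed at a deterministic grader so --judge-model is not needed (assuming no scenario explicitly routes to llm_judge):

# Minimal sketch of invoking the evaluate CLI in-process; equivalent to
# running `uv run evaluate ...`. The trajectory directory and scenario file
# are hypothetical placeholders.
from evaluation.cli import main

exit_code = main([
    "--trajectories", "runs/",                    # {run_id}.json files from persist_trajectory
    "--scenarios", "src/scenarios/example.json",  # hypothetical scenario file
    "--output", "report.json",
    "--grader-default", "exact_string_match",     # deterministic, so no judge model required
])
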
36 changes: 36 additions & 0 deletions src/evaluation/graders/__init__.py
@@ -0,0 +1,36 @@
"""Pluggable grader registry.

Each grader is a callable taking ``(scenario, answer, trajectory_text)``
and returning a :class:`~evaluation.models.GradeResult`. Registration
happens via :func:`register`; the CLI looks up graders by name from
``scenario.grading_method`` (falling back to a CLI-supplied default).
"""

from __future__ import annotations

from typing import Callable

from ..models import GradeResult, Scenario

Grader = Callable[[Scenario, str, str], GradeResult]

_REGISTRY: dict[str, Grader] = {}


def register(name: str, grader: Grader) -> None:
_REGISTRY[name] = grader


def get(name: str) -> Grader:
if name not in _REGISTRY:
raise KeyError(
f"unknown grader {name!r}; registered: {sorted(_REGISTRY)}"
)
return _REGISTRY[name]


def names() -> list[str]:
return sorted(_REGISTRY)


from . import deterministic # noqa: E402,F401 — register-on-import
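
Because graders are plain callables registered by name, project-specific checks can be added without modifying this package; a minimal sketch (the substring check is purely illustrative, not part of the shipped registry):

# Minimal sketch of a custom grader; the substring check is illustrative only.
from evaluation.graders import register
from evaluation.models import GradeResult, Scenario


def contains_expected(
    scenario: Scenario, answer: str, trajectory_text: str
) -> GradeResult:
    expected = scenario.expected_answer or ""
    passed = bool(expected) and expected.lower() in answer.lower()
    return GradeResult(
        grading_method="contains_expected",
        passed=passed,
        score=1.0 if passed else 0.0,
        rationale="" if passed else f"{expected!r} not found in answer",
    )


register("contains_expected", contains_expected)

A scenario can then set "grading_method": "contains_expected" to route to it.
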
71 changes: 71 additions & 0 deletions src/evaluation/graders/deterministic.py
@@ -0,0 +1,71 @@
"""Pure deterministic graders — no LLM, no network."""

from __future__ import annotations

import math

from ..models import GradeResult, Scenario
from . import register


def exact_string_match(
scenario: Scenario, answer: str, trajectory_text: str
) -> GradeResult:
expected = scenario.expected_answer
if expected is None:
return GradeResult(
grading_method="exact_string_match",
passed=False,
score=0.0,
rationale="scenario has no expected_answer",
)

a = str(answer).strip().lower()
e = str(expected).strip().lower()
passed = a == e
return GradeResult(
grading_method="exact_string_match",
passed=passed,
score=1.0 if passed else 0.0,
rationale="" if passed else f"expected {expected!r}, got {answer!r}",
details={"expected": expected, "actual": answer},
)


def numeric_match(
scenario: Scenario, answer: str, trajectory_text: str
) -> GradeResult:
expected_raw = scenario.expected_answer
extra = scenario.model_extra or {}
tolerance = float(extra.get("tolerance", 1e-6))

    if expected_raw is None:
        return GradeResult(
            grading_method="numeric_match",
            passed=False,
            score=0.0,
            rationale="scenario has no expected_answer",
        )

try:
a = float(answer)
e = float(expected_raw)
except (TypeError, ValueError) as err:
return GradeResult(
grading_method="numeric_match",
passed=False,
rationale=f"could not parse numbers: {err}",
details={"expected": expected_raw, "actual": answer},
)

passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance)
return GradeResult(
grading_method="numeric_match",
passed=passed,
score=1.0 if passed else 0.0,
rationale="" if passed else f"|{a} - {e}| > tol={tolerance}",
details={"expected": e, "actual": a, "tolerance": tolerance},
)


register("exact_string_match", exact_string_match)
register("numeric_match", numeric_match)