3 changes: 2 additions & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/agent", "src/llm", "src/observability"]
packages = ["src/agent", "src/evaluation", "src/llm", "src/observability"]

[project]
name = "assetopsbench-mcp"
@@ -42,6 +42,7 @@ wo-mcp-server = "servers.wo.main:main"
vibration-mcp-server = "servers.vibration.main:main"
openai-agent = "agent.openai_agent.cli:main"
deep-agent = "agent.deep_agent.cli:main"
evaluate = "evaluation.cli:main"


[dependency-groups]
34 changes: 34 additions & 0 deletions src/evaluation/__init__.py
@@ -0,0 +1,34 @@
"""Offline evaluation harness for AssetOpsBench agent runs.

Consumes saved trajectory files (written by
:func:`observability.persistence.persist_trajectory`) and scenario files
(under ``src/scenarios/``) and emits a structured JSON report combining
graded outcomes with operational metrics.

The shape mirrors conventions from SWE-bench, HELM, and τ-bench:
``run`` (executes the agent — already exists) → ``evaluate`` (this
module) → ``report.json``. Re-grading from saved trajectories is
first-class.
"""

from .models import (
AggregateOps,
EvalReport,
GradeResult,
OpsMetrics,
PersistedTrajectory,
Scenario,
ScenarioResult,
TypeBreakdown,
)

__all__ = [
"AggregateOps",
"EvalReport",
"GradeResult",
"OpsMetrics",
"PersistedTrajectory",
"Scenario",
"ScenarioResult",
"TypeBreakdown",
]
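
The concrete scenario schema lives in evaluation.models, which is not part of this diff; as a rough sketch of what one scenario entry looks like from the graders' point of view (field names inferred only from how exact_string_match and numeric_match read them, with the id and question fields purely hypothetical):

# Rough sketch of one scenario entry, inferred from the grader code only:
# grading_method routes to the registry, expected_answer is the ground truth,
# and extra keys such as "tolerance" reach numeric_match via model_extra.
# "id" and "question" are hypothetical placeholder fields.
example_scenario = {
    "id": "scn-001",
    "question": "How many open work orders does Chiller 6 have?",
    "grading_method": "numeric_match",
    "expected_answer": "3",
    "tolerance": 0.0,
}
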
106 changes: 106 additions & 0 deletions src/evaluation/cli.py
@@ -0,0 +1,106 @@
"""``uv run evaluate`` — offline grading + report generation."""

from __future__ import annotations

import argparse
import logging
import sys
from pathlib import Path

from . import graders as grader_registry
from .report import render_summary, write_report
from .runner import evaluate


def _build_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(
prog="evaluate",
description=(
"Grade saved agent trajectories against scenario files and "
"emit a JSON report."
),
)
p.add_argument(
"--trajectories",
type=Path,
required=True,
help="Directory of {run_id}.json trajectory files (or a single file).",
)
p.add_argument(
"--scenarios",
type=Path,
nargs="+",
required=True,
help="One or more scenario JSON / JSONL files.",
)
p.add_argument(
"--output",
type=Path,
required=True,
help="Path to write the JSON report.",
)
p.add_argument(
"--grader-default",
default="llm_judge",
help="Grader name when scenario.grading_method is unset. "
"Default: llm_judge.",
)
p.add_argument(
"--judge-model",
default=None,
help="Model id for the LLM judge (e.g. "
"litellm_proxy/anthropic/claude-opus-4-5). "
"Required when any scenario routes to llm_judge.",
)
p.add_argument(
"-v",
"--verbose",
action="store_true",
help="Enable INFO-level logging.",
)
return p


def _maybe_install_judge(judge_model: str | None) -> None:
if not judge_model:
return
    # Imported lazily so the CLI still works for deterministic-only runs
    # even when the LiteLLM dependency is unavailable in the environment.
from llm import LiteLLMBackend # type: ignore[import-not-found]

from .graders.llm_judge import install

install(LiteLLMBackend(model=judge_model))


def _validate_grader_default(name: str) -> None:
    try:
        grader_registry.get(name)
    except KeyError as exc:
        # KeyError wraps its message in quotes when stringified; unwrap it
        # so the CLI error reads cleanly.
        raise SystemExit(exc.args[0]) from exc


def main(argv: list[str] | None = None) -> int:
args = _build_parser().parse_args(argv)
logging.basicConfig(
level=logging.INFO if args.verbose else logging.WARNING,
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

_maybe_install_judge(args.judge_model)
_validate_grader_default(args.grader_default)

report = evaluate(
trajectories_path=args.trajectories,
scenarios_paths=list(args.scenarios),
default_grading_method=args.grader_default,
)

out = write_report(report, args.output)
print(render_summary(report))
print(f"\nReport written: {out}")
return 0


if __name__ == "__main__":
sys.exit(main())
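
For reference, a minimal sketch of driving the new command programmatically; the paths below are hypothetical, and --grader-default is pointed at a deterministic grader so --judge-model is not needed (assuming no scenario explicitly routes to llm_judge):

# Minimal sketch of invoking the evaluate CLI in-process; equivalent to
# running `uv run evaluate ...`. The trajectory directory and scenario file
# are hypothetical placeholders.
from evaluation.cli import main

exit_code = main([
    "--trajectories", "runs/",                    # {run_id}.json files from persist_trajectory
    "--scenarios", "src/scenarios/example.json",  # hypothetical scenario file
    "--output", "report.json",
    "--grader-default", "exact_string_match",     # deterministic, so no judge model required
])
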
36 changes: 36 additions & 0 deletions src/evaluation/graders/__init__.py
@@ -0,0 +1,36 @@
"""Pluggable grader registry.

Each grader is a callable taking ``(scenario, answer, trajectory_text)``
and returning a :class:`~evaluation.models.GradeResult`. Registration
happens via :func:`register`; the CLI looks up graders by name from
``scenario.grading_method`` (falling back to a CLI-supplied default).
"""

from __future__ import annotations

from typing import Callable

from ..models import GradeResult, Scenario

Grader = Callable[[Scenario, str, str], GradeResult]

_REGISTRY: dict[str, Grader] = {}


def register(name: str, grader: Grader) -> None:
_REGISTRY[name] = grader


def get(name: str) -> Grader:
if name not in _REGISTRY:
raise KeyError(
f"unknown grader {name!r}; registered: {sorted(_REGISTRY)}"
)
return _REGISTRY[name]


def names() -> list[str]:
return sorted(_REGISTRY)


from . import deterministic # noqa: E402,F401 — register-on-import
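
Because graders are plain callables registered by name, project-specific checks can be added without modifying this package; a minimal sketch (the substring check is purely illustrative, not part of the shipped registry):

# Minimal sketch of a custom grader; the substring check is illustrative only.
from evaluation.graders import register
from evaluation.models import GradeResult, Scenario


def contains_expected(
    scenario: Scenario, answer: str, trajectory_text: str
) -> GradeResult:
    expected = scenario.expected_answer or ""
    passed = bool(expected) and expected.lower() in answer.lower()
    return GradeResult(
        grading_method="contains_expected",
        passed=passed,
        score=1.0 if passed else 0.0,
        rationale="" if passed else f"{expected!r} not found in answer",
    )


register("contains_expected", contains_expected)

A scenario can then set "grading_method": "contains_expected" to route to it.
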
71 changes: 71 additions & 0 deletions src/evaluation/graders/deterministic.py
@@ -0,0 +1,71 @@
"""Pure deterministic graders — no LLM, no network."""

from __future__ import annotations

import math

from ..models import GradeResult, Scenario
from . import register


def exact_string_match(
scenario: Scenario, answer: str, trajectory_text: str
) -> GradeResult:
expected = scenario.expected_answer
if expected is None:
return GradeResult(
grading_method="exact_string_match",
passed=False,
score=0.0,
rationale="scenario has no expected_answer",
)

a = str(answer).strip().lower()
e = str(expected).strip().lower()
passed = a == e
return GradeResult(
grading_method="exact_string_match",
passed=passed,
score=1.0 if passed else 0.0,
rationale="" if passed else f"expected {expected!r}, got {answer!r}",
details={"expected": expected, "actual": answer},
)


def numeric_match(
scenario: Scenario, answer: str, trajectory_text: str
) -> GradeResult:
expected_raw = scenario.expected_answer
extra = scenario.model_extra or {}
tolerance = float(extra.get("tolerance", 1e-6))

    if expected_raw is None:
        return GradeResult(
            grading_method="numeric_match",
            passed=False,
            score=0.0,
            rationale="scenario has no expected_answer",
        )

try:
a = float(answer)
e = float(expected_raw)
except (TypeError, ValueError) as err:
return GradeResult(
grading_method="numeric_match",
passed=False,
rationale=f"could not parse numbers: {err}",
details={"expected": expected_raw, "actual": answer},
)

passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance)
return GradeResult(
grading_method="numeric_match",
passed=passed,
score=1.0 if passed else 0.0,
rationale="" if passed else f"|{a} - {e}| > tol={tolerance}",
details={"expected": e, "actual": a, "tolerance": tolerance},
)


register("exact_string_match", exact_string_match)
register("numeric_match", numeric_match)