diff --git a/plugins/copilot/README.md b/plugins/copilot/README.md index c5a32c78..c51625f8 100644 --- a/plugins/copilot/README.md +++ b/plugins/copilot/README.md @@ -45,8 +45,17 @@ Ask Copilot things like *"run the sleep cycle"*, *"what did the last sleep propose?"*, *"adopt the staged sleep proposal"*. Copilot calls the MCP tools: `sleep_status`, `sleep_dry_run`, `sleep_run`, `sleep_adopt`, `sleep_harvest`. -Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`), and -`scope` arguments. Default backend is `mock` (no API spend). +Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`/`copilot`), and +`scope` arguments. Default backend is `mock` (no API spend). The `copilot` +backend drives the GitHub Copilot CLI (`copilot -p ... --output-format json`) +and requires the `copilot` CLI to be installed and authenticated. + +For speed, the `copilot` backend runs each call against an isolated +`COPILOT_HOME` with built-in MCP servers and custom instructions disabled, so +your user MCP servers (including this project's own) are not spawned per call +(~5x faster). Override with `SKILLOPT_SLEEP_COPILOT_HOME=<dir>`, pick a model +with `SKILLOPT_SLEEP_COPILOT_MODEL`, or set `SKILLOPT_SLEEP_COPILOT_FULL_ENV=1` +to use your real Copilot environment instead. 
## Verify the server directly (no Copilot needed) diff --git a/plugins/copilot/mcp_server.py b/plugins/copilot/mcp_server.py index d03a95b6..2c592aea 100755 --- a/plugins/copilot/mcp_server.py +++ b/plugins/copilot/mcp_server.py @@ -45,8 +45,8 @@ "type": "object", "properties": { "project": {"type": "string", "description": "Project dir to evolve (default: cwd)."}, - "backend": {"type": "string", "enum": ["mock", "claude", "codex"], - "description": "mock = no API spend (default); claude/codex = real."}, + "backend": {"type": "string", "enum": ["mock", "claude", "codex", "copilot"], + "description": "mock = no API spend (default); claude/codex/copilot = real."}, "scope": {"type": "string", "enum": ["invoked", "all"]}, }, "additionalProperties": False, diff --git a/plugins/copilot/skillopt/README.md b/plugins/copilot/skillopt/README.md new file mode 100644 index 00000000..c4910a23 --- /dev/null +++ b/plugins/copilot/skillopt/README.md @@ -0,0 +1,98 @@ +# SkillOpt — GitHub Copilot integration + +Give **Copilot** (CLI or VS Code) direct access to the **SkillOpt** research +engine via a tiny **MCP server**. MCP is GitHub's supported way to extend +Copilot, so this works across Copilot CLI, VS Code, and other MCP clients with +the same server. + +SkillOpt is **validation-gated, text-space skill optimization**: it reflects on +rollouts, makes bounded edits to a skill, and keeps a change only if it improves +a held-out validation set. This plugin exposes the repo's training and eval +entry points (`scripts/train.py`, `scripts/eval_only.py`) as Copilot tools. + +> This is the companion to the **SkillOpt-Sleep** plugin (`../mcp_server.py`, +> `sleep_*` tools). Sleep evolves a *local coding agent* from your past +> sessions; this server drives the *research* training/eval loops on the +> benchmark configs in [`../../../configs`](../../../configs). 
+ +## What's here + +| File | Purpose | +|---|---| +| `mcp_server.py` | stdlib-only MCP (stdio) server exposing `skillopt_*` tools | +| `mcp-config.example.json` | drop-in MCP server config | +| `copilot-instructions.snippet.md` | paste into `.github/copilot-instructions.md` | + +## Install + +Requires Python ≥ 3.10. The MCP server itself is pure stdlib, but the tools it +launches need SkillOpt's runtime deps — install the package first: + +```bash +pip install -e . # or: pip install -r requirements.txt +``` + +1. **Register the MCP server.** Add the server to your Copilot MCP config + (Copilot CLI: `~/.copilot/mcp-config.json`; VS Code: your MCP settings). + Use `mcp-config.example.json` as a template — set `SKILLOPT_REPO` to this + repo's path: + + ```json + { + "mcpServers": { + "skillopt": { + "command": "python3", + "args": ["/abs/path/SkillOpt/plugins/copilot/skillopt/mcp_server.py"], + "env": { "SKILLOPT_REPO": "/abs/path/SkillOpt" } + } + } + } + ``` + +2. **(Optional) Tell Copilot about it.** Append + `copilot-instructions.snippet.md` to your repo's + `.github/copilot-instructions.md` so Copilot reaches for the tools when the + user asks to "optimize a skill" or "train on a benchmark". + +## Use + +Ask Copilot things like *"what configs can I run?"*, *"optimize the searchqa +skill"*, or *"evaluate this skill on the dataset"*. Copilot calls the MCP tools: +`skillopt_list_configs`, `skillopt_train`, `skillopt_eval`. + +| Tool | Required args | Notes | +|---|---|---| +| `skillopt_list_configs` | — | Lists `configs/**/*.yaml` you can pass as `config`. | +| `skillopt_train` | `config` | Runs a reflective optimization loop. Long-running; spends budget. | +| `skillopt_eval` | `config`, `skill` | Evaluates one skill markdown file; no training. 
| + +Common optional args (both train and eval): `env`, `backend`, +`optimizer_model`, `target_model`, `out_root`, `cfg_options` (space-separated +`KEY=VALUE` YAML overrides), and `extra_args` (raw passthrough flags for the +underlying script). `skillopt_train` also accepts `num_epochs`, `batch_size`, +`seed`, and `use_gate`. + +Runs can be very long. The server's subprocess timeout defaults to 6 hours; +override it with the `SKILLOPT_RUN_TIMEOUT` environment variable (seconds). + +## Verify the server directly (no Copilot needed) + +```bash +printf '%s\n' \ + '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' \ + '{"jsonrpc":"2.0","id":2,"method":"tools/list"}' \ + '{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"skillopt_list_configs","arguments":{}}}' \ + | SKILLOPT_REPO="$(pwd)" python3 plugins/copilot/skillopt/mcp_server.py +``` + +You should see the server info, the three `skillopt_*` tools, and the list of +benchmark configs. + +## Notes / status + +- MCP is the stable, official Copilot extension surface, so this is portable + across Copilot CLI and IDE from one server. +- `skillopt_list_configs` is filesystem-only and safe to call anytime; + `skillopt_train` / `skillopt_eval` shell out to the repo scripts and require + the SkillOpt runtime deps (and, for real backends, model credentials — see + [`../../../.env.example`](../../../.env.example)). diff --git a/plugins/copilot/skillopt/copilot-instructions.snippet.md b/plugins/copilot/skillopt/copilot-instructions.snippet.md new file mode 100644 index 00000000..b53c4a5d --- /dev/null +++ b/plugins/copilot/skillopt/copilot-instructions.snippet.md @@ -0,0 +1,33 @@ + + +## SkillOpt (research skill-optimization engine) + +This repo exposes the core **SkillOpt** training/eval engine via an MCP server +(`skillopt`). 
SkillOpt is validation-gated, text-space skill optimization: it +reflects on rollouts, makes bounded edits to a skill, and keeps a change only +if it improves a held-out validation set. + +When the user asks to "optimize a skill", "train on <benchmark>", "run +SkillOpt", "evaluate this skill", or "what configs can I run", use the MCP +tools: + +- `skillopt_list_configs` — list the benchmark YAML configs you can pass as `config` +- `skillopt_train` — run a reflective skill-optimization loop on a config (long-running; spends API/compute budget) +- `skillopt_eval` — evaluate a single skill markdown file on a dataset (no training) + +Guidance: +- Always run `skillopt_list_configs` first if you don't already know a valid `config` path. +- `skillopt_train` and `skillopt_eval` are long-running and consume the user's + model backend/budget — confirm the `config`, `backend`, and model choices + with the user before launching, and surface the held-out gate result when the + run finishes. +- For one-off YAML overrides use `cfg_options` (e.g. `seed=123 batch_size=40`); + for any other underlying flag use `extra_args`. + +This is distinct from the **SkillOpt-Sleep** MCP server (`skillopt-sleep`, +`sleep_*` tools), which evolves a local coding agent from past sessions rather +than running the research benchmarks. 
#!/usr/bin/env python3
"""SkillOpt (research engine) — minimal MCP server (stdio, stdlib-only).

Exposes the core SkillOpt skill-optimization engine as MCP tools so any
MCP-capable client (GitHub Copilot CLI / VS Code, Claude Desktop, etc.) can
drive it. No third-party deps: speaks JSON-RPC 2.0 over stdio with just the
handful of MCP methods clients need.

This is the companion to the SkillOpt-Sleep MCP server (``../mcp_server.py``).
Where Sleep evolves a *local agent* from past sessions, this server drives the
*research* training/eval loops from this repo (``scripts/train.py`` /
``scripts/eval_only.py``) against the benchmark configs in ``configs/``.

Tools exposed:
  - skillopt_list_configs : discover the benchmark YAML configs you can use
  - skillopt_train        : run a reflective skill-optimization (training) loop
  - skillopt_eval         : evaluate a single skill on a dataset (no training)

``skillopt_train`` and ``skillopt_eval`` shell out to the repo's entry-point
scripts and return their captured stdout/stderr once the run finishes.
Configure your client to launch:
    python plugins/copilot/skillopt/mcp_server.py
"""
from __future__ import annotations

import glob
import json
import os
import subprocess
import sys

# Repo root: three levels up from plugins/copilot/skillopt/mcp_server.py
REPO_ROOT = os.environ.get("SKILLOPT_REPO") or os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "..", "..")
)
PROTOCOL_VERSION = "2024-11-05"

_DEFAULT_RUN_TIMEOUT_SECONDS = 21600  # 6h

# JSON-RPC 2.0 error codes used by this server.
_PARSE_ERROR = -32700
_INVALID_REQUEST = -32600
_METHOD_NOT_FOUND = -32601
_INVALID_PARAMS = -32602
_INTERNAL_ERROR = -32603


def _env_int(name: str, default: int) -> int:
    """Read a positive integer from the environment, else return *default*.

    A missing, non-numeric, or non-positive value must not stop the server
    from starting, so every such case falls back to *default*.
    """
    try:
        value = int(os.environ.get(name, ""))
    except ValueError:
        return default
    return value if value > 0 else default


# Training/eval runs are long; give the engine plenty of headroom.
RUN_TIMEOUT_SECONDS = _env_int("SKILLOPT_RUN_TIMEOUT", _DEFAULT_RUN_TIMEOUT_SECONDS)

# Ordered flags that the train/eval scripts accept directly.
_FLAG_ARGS = (
    "config", "skill", "split", "env", "backend",
    "optimizer_model", "target_model", "out_root",
    "num_epochs", "batch_size", "seed", "use_gate",
)


def _list_configs() -> str:
    """List the benchmark configs available under configs/ (filesystem only)."""
    pattern = os.path.join(REPO_ROOT, "configs", "**", "*.yaml")
    paths = sorted(glob.glob(pattern, recursive=True))
    if not paths:
        return f"[no configs found under {os.path.join(REPO_ROOT, 'configs')}]"
    rels = [os.path.relpath(p, REPO_ROOT).replace(os.sep, "/") for p in paths]
    lines = ["Available SkillOpt configs (pass as `config`):", ""]
    lines += [f"  - {r}" for r in rels]
    return "\n".join(lines)


def _cli_value(val) -> str:
    """Render one argument value as a command-line token.

    JSON booleans become ``true``/``false`` (the spelling the tool schema
    advertises) instead of Python's ``True``/``False``.
    """
    if isinstance(val, bool):
        return "true" if val else "false"
    return str(val)


def _words(value) -> list:
    """Normalise a space-separated string or a sequence into a token list."""
    if isinstance(value, str):
        return value.split()
    if isinstance(value, (list, tuple)):
        return [str(x) for x in value]
    return [str(value)]


def _run_script(script_rel: str, args: dict, *, required: tuple[str, ...] = ()) -> str:
    """Shell out to a repo entry-point script, mapping args -> --flags.

    Args:
        script_rel: Script file name under ``scripts/`` (e.g. ``train.py``).
        args: Tool arguments supplied by the MCP client.
        required: Argument names that must be present and non-empty.

    Returns:
        The script's stdout, followed by a ``[stderr]`` section when stderr is
        non-empty. Any failure (missing argument, timeout, launch error, or a
        non-zero exit code) yields text starting with ``[error]``.
    """
    for key in required:
        if not args.get(key):
            return f"[error] missing required argument: {key}"

    py = sys.executable or "python3"
    cmd = [py, os.path.join("scripts", script_rel)]

    for key in _FLAG_ARGS:
        val = args.get(key)
        if val is None or val == "":
            continue
        cmd += [f"--{key}", _cli_value(val)]

    # cfg-options: arbitrary KEY=VALUE YAML overrides (nargs="+").
    cfg_options = args.get("cfg_options")
    if cfg_options:
        cmd += ["--cfg-options", *_words(cfg_options)]

    # extra_args: raw passthrough for any other train/eval flag.
    extra = args.get("extra_args")
    if extra:
        cmd += _words(extra)

    try:
        proc = subprocess.run(
            cmd, cwd=REPO_ROOT, capture_output=True, text=True,
            timeout=RUN_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired:
        return f"[error] run exceeded {RUN_TIMEOUT_SECONDS}s timeout: {' '.join(cmd)}"
    except Exception as e:  # noqa: BLE001 - report any launch failure to the client
        return f"[error] failed to run script: {e}"
    out = (proc.stdout or "").strip()
    err = (proc.stderr or "").strip()
    body = out + (("\n[stderr]\n" + err) if err else "")
    if proc.returncode != 0:
        # Surface the failure even when the script printed output; otherwise
        # the client cannot tell a crashed run from a successful one.
        header = f"[error] exit code {proc.returncode}"
        return f"{header}\n{body}" if body else f"{header}, no output"
    return body or f"[done] exit code {proc.returncode}, no output"


TOOLS = [
    {
        "name": "skillopt_list_configs",
        "description": "List the benchmark YAML configs under configs/ that can be passed as `config` to train/eval.",
    },
    {
        "name": "skillopt_train",
        "description": "Run a SkillOpt reflective skill-optimization (training) loop on a benchmark config. Long-running; uses your model backend/budget.",
    },
    {
        "name": "skillopt_eval",
        "description": "Evaluate a single skill markdown file on a dataset without training (scripts/eval_only.py).",
    },
]
_BY_NAME = {t["name"]: t for t in TOOLS}

_NO_ARGS_SCHEMA = {"type": "object", "properties": {}, "additionalProperties": False}

_COMMON_PROPS = {
    "config": {"type": "string",
               "description": "Path to a benchmark YAML config (e.g. configs/searchqa/default.yaml). See skillopt_list_configs."},
    "env": {"type": "string", "description": "Override the environment/adapter name (e.g. searchqa, alfworld)."},
    "backend": {"type": "string", "description": "Model backend (e.g. azure_openai, claude, codex, qwen, minimax)."},
    "optimizer_model": {"type": "string", "description": "Model used for reflection/skill rewriting (the optimizer)."},
    "target_model": {"type": "string", "description": "Model used to execute tasks (the target)."},
    "out_root": {"type": "string", "description": "Output directory root for run artifacts."},
    "cfg_options": {"type": "string", "description": "Space-separated YAML overrides, e.g. 'seed=123 batch_size=40'."},
    "extra_args": {"type": "string", "description": "Raw passthrough flags for the underlying script, e.g. '--workers 8 --max_turns 30'."},
}

_TRAIN_SCHEMA = {
    "type": "object",
    "properties": {
        **_COMMON_PROPS,
        "num_epochs": {"type": "integer", "description": "Number of optimization epochs."},
        "batch_size": {"type": "integer", "description": "Tasks per optimization step."},
        "seed": {"type": "integer", "description": "Random seed."},
        "use_gate": {"type": "string", "enum": ["true", "false"],
                     "description": "Whether to keep the held-out validation gate on (default on)."},
    },
    "required": ["config"],
    "additionalProperties": False,
}

_EVAL_SCHEMA = {
    "type": "object",
    "properties": {
        **_COMMON_PROPS,
        "skill": {"type": "string", "description": "Path to the skill markdown file to evaluate."},
        "split": {"type": "string", "description": "Dataset split to evaluate (default: all)."},
    },
    "required": ["config", "skill"],
    "additionalProperties": False,
}

_SCHEMA_BY_NAME = {
    "skillopt_list_configs": _NO_ARGS_SCHEMA,
    "skillopt_train": _TRAIN_SCHEMA,
    "skillopt_eval": _EVAL_SCHEMA,
}


def _result(id_, result):
    """Build a JSON-RPC 2.0 success response."""
    return {"jsonrpc": "2.0", "id": id_, "result": result}


def _error(id_, code, message):
    """Build a JSON-RPC 2.0 error response."""
    return {"jsonrpc": "2.0", "id": id_, "error": {"code": code, "message": message}}


def _dispatch(name: str, args: dict) -> str:
    """Run the named tool and return its text output."""
    if name == "skillopt_list_configs":
        return _list_configs()
    if name == "skillopt_train":
        return _run_script("train.py", args, required=("config",))
    if name == "skillopt_eval":
        return _run_script("eval_only.py", args, required=("config", "skill"))
    return f"[error] unknown tool: {name}"


def handle(req):
    """Handle one decoded JSON-RPC message.

    Returns the response dict, or ``None`` when no reply must be sent: the
    message is a notification, i.e. it carries no ``id`` member.
    """
    if not isinstance(req, dict):
        # Batches (arrays) and scalars are not supported; reject them rather
        # than crash on ``req.get``.
        return _error(None, _INVALID_REQUEST, "invalid request: expected a JSON object")
    if "id" not in req:
        # JSON-RPC 2.0: a notification must never be answered, not even with
        # an error (covers notifications/initialized, notifications/cancelled, ...).
        return None

    method = req.get("method")
    id_ = req.get("id")
    if method == "initialize":
        return _result(id_, {
            "protocolVersion": PROTOCOL_VERSION,
            "capabilities": {"tools": {}},
            "serverInfo": {"name": "skillopt", "version": "0.1.0"},
        })
    if method in ("notifications/initialized", "initialized"):
        return None  # tolerated even when a client wrongly attaches an id
    if method == "tools/list":
        return _result(id_, {"tools": [
            {"name": t["name"], "description": t["description"],
             "inputSchema": _SCHEMA_BY_NAME[t["name"]]}
            for t in TOOLS
        ]})
    if method == "tools/call":
        params = req.get("params") or {}
        if not isinstance(params, dict):
            return _error(id_, _INVALID_PARAMS, "params must be an object")
        name = params.get("name")
        if name not in _BY_NAME:
            return _error(id_, _INVALID_PARAMS, f"unknown tool: {name}")
        arguments = params.get("arguments") or {}
        if not isinstance(arguments, dict):
            return _error(id_, _INVALID_PARAMS, "arguments must be an object")
        text = _dispatch(name, arguments)
        payload = {"content": [{"type": "text", "text": text}]}
        if text.startswith("[error]"):
            # MCP reports tool-level failures in-band via ``isError``.
            payload["isError"] = True
        return _result(id_, payload)
    if method == "ping":
        return _result(id_, {})
    return _error(id_, _METHOD_NOT_FOUND, f"method not found: {method}")


def main(stdin=None, stdout=None) -> int:
    """Serve newline-delimited JSON-RPC until *stdin* is exhausted.

    Args:
        stdin: Line iterable to read requests from (default: ``sys.stdin``).
        stdout: Writable text stream for responses (default: ``sys.stdout``).

    Returns:
        Process exit status (always 0).
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
        except ValueError:
            resp = _error(None, _PARSE_ERROR, "parse error")
        else:
            try:
                resp = handle(req)
            except Exception as e:  # noqa: BLE001 - one bad request must not kill the server
                if isinstance(req, dict) and "id" in req:
                    resp = _error(req["id"], _INTERNAL_ERROR, f"internal error: {e}")
                else:
                    resp = None
        if resp is not None:
            stdout.write(json.dumps(resp) + "\n")
            stdout.flush()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def resolve_copilot_path(explicit: str = "") -> str:
    """Find the GitHub Copilot CLI (`copilot`) binary.

    Resolution order: the *explicit* argument, then the
    ``SKILLOPT_SLEEP_COPILOT_PATH`` environment variable, then a ``PATH``
    lookup. Falls back to the bare name ``copilot`` when nothing is found.
    """
    if explicit:
        return explicit
    env = os.environ.get("SKILLOPT_SLEEP_COPILOT_PATH")
    if env:
        return env
    import shutil
    return shutil.which("copilot") or "copilot"


class CopilotCliBackend(CliBackend):
    """Drives the GitHub Copilot CLI in non-interactive mode.

    Uses ``copilot -p <prompt> --output-format json`` and parses the emitted
    JSONL event stream, returning the concatenated ``assistant.message``
    content. The plain-text / ``--silent`` modes do not reliably stream the
    response to stdout on all platforms, so JSONL is used for robust capture.

    The call runs in a clean temp cwd with streaming disabled and tools allowed
    (so non-interactive mode never blocks on a permission prompt); ``_call``'s
    prompts ask for final-answer text only, so no tool use is expected there,
    while ``attempt_with_tools`` exposes real, cross-platform callable shims in
    the working directory for honest tool-call detection.

    Startup overhead is minimised: each invocation points ``COPILOT_HOME`` at a
    dedicated, isolated config dir (no user ``mcp-config.json``, so the user's
    MCP servers — including this project's own — are NOT spawned, avoiding a
    slow recursive launch), and built-in MCP servers / custom instructions are
    disabled. Auth is read from the OS credential store / token env vars, which
    live outside ``COPILOT_HOME``, so isolation does not break authentication.
    Set ``SKILLOPT_SLEEP_COPILOT_HOME`` to a directory to override the isolated
    home (an unset or empty value selects the default temp-dir home), or set
    ``SKILLOPT_SLEEP_COPILOT_FULL_ENV=1`` to use the user's real environment
    instead.
    """

    name = "copilot"

    def __init__(self, model: str = "", copilot_path: str = "", timeout: int = 240) -> None:
        super().__init__(model=model or os.environ.get("SKILLOPT_SLEEP_COPILOT_MODEL", ""),
                         timeout=timeout)
        self.copilot_path = resolve_copilot_path(copilot_path)
        self.full_env = os.environ.get("SKILLOPT_SLEEP_COPILOT_FULL_ENV", "") == "1"
        # Stable isolated home so first-run setup is cached across calls.
        if self.full_env:
            self.copilot_home = ""
        else:
            self.copilot_home = os.environ.get("SKILLOPT_SLEEP_COPILOT_HOME") or os.path.join(
                tempfile.gettempdir(), "skillopt_sleep_copilot_home"
            )
            try:
                # The default lives in the shared temp dir, so keep it private
                # to the current user when we are the ones creating it.
                os.makedirs(self.copilot_home, mode=0o700, exist_ok=True)
            except OSError:
                # Best effort: without a usable home, run un-isolated.
                self.copilot_home = ""

    def _build_cmd(self, prompt: str, cwd: str) -> List[str]:
        """Build the non-interactive ``copilot`` command line rooted at *cwd*."""
        cmd = [
            self.copilot_path, "-p", prompt,
            "--output-format", "json",
            "--stream", "off",
            "--no-color",
            "--log-level", "none",
            "--allow-all-tools",
            "-C", cwd,
        ]
        if not self.full_env:
            # Drop unneeded startup work: no built-in (github) MCP server and no
            # AGENTS.md / custom-instruction loading. With an isolated home that
            # has no mcp-config.json, no user MCP servers spawn either.
            cmd += ["--disable-builtin-mcps", "--no-custom-instructions"]
        if self.model:
            cmd += ["--model", self.model]
        return cmd

    def _child_env(self) -> Dict[str, str]:
        """Return the child environment, pointing at the isolated home if set."""
        env = os.environ.copy()
        if self.copilot_home:
            env["COPILOT_HOME"] = self.copilot_home
        return env

    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
        """Run one prompt through the CLI and return the assistant text.

        Returns ``""`` on any launch failure or timeout (best effort).
        ``max_tokens`` is accepted for interface compatibility and is not
        forwarded to the CLI.
        """
        import shutil
        clean_cwd = tempfile.mkdtemp(prefix="skillopt_sleep_copilot_")
        try:
            proc = subprocess.run(
                self._build_cmd(prompt, clean_cwd),
                capture_output=True, text=True, timeout=self.timeout, cwd=clean_cwd,
                encoding="utf-8", errors="replace", env=self._child_env(),
            )
        except Exception:  # noqa: BLE001 - deliberate best effort: no answer
            return ""
        finally:
            shutil.rmtree(clean_cwd, ignore_errors=True)
        return self._parse_jsonl_response(proc.stdout or "")

    @staticmethod
    def _parse_jsonl_response(raw: str) -> str:
        """Join the ``content`` of every ``assistant.message`` JSONL event.

        Non-JSON lines, other event types, and empty or non-string content are
        ignored. Messages are joined with newlines and the result is stripped.
        """
        parts: List[str] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line or not line.startswith("{"):
                continue
            try:
                obj = json.loads(line)
            except Exception:  # noqa: BLE001 - tolerate noise in the stream
                continue
            if obj.get("type") != "assistant.message":
                continue
            data = obj.get("data")
            # Guard against a malformed event whose ``data`` is not an object.
            content = data.get("content") if isinstance(data, dict) else None
            if isinstance(content, str) and content:
                parts.append(content)
        return "\n".join(parts).strip()

    def attempt_with_tools(self, task, skill, memory, tools):
        """Attempt *task* with real tool shims and report which ones ran.

        Returns ``(response_text, called)`` where ``called`` lists the tool
        names (in the order given) whose shim was actually executed.
        """
        # Expose REAL, callable tool shims in the working directory so the
        # gbrain quick-answerer judge (tool_called=search) is validated
        # honestly: we detect each call from the shim's log, not from a
        # self-reported marker. The Copilot CLI is the Windows-validated
        # backend, so the shims must be cross-platform — a bash `#!/usr/bin/env
        # bash` + chmod shim does NOT execute via `./tool` under PowerShell/cmd,
        # so on Windows we emit a `.cmd` batch shim instead.
        import shutil
        import stat
        work = tempfile.mkdtemp(prefix="skillopt_sleep_copilottools_")
        calllog = os.path.join(work, "_tool_calls.log")
        tool_names = tools or ["search"]
        example = tool_names[0]
        is_windows = os.name == "nt"
        try:
            for tname in tool_names:
                if is_windows:
                    shim = os.path.join(work, f"{tname}.cmd")
                    with open(shim, "w") as f:
                        # `%~n0` is the script's own base name (the tool name);
                        # writing it keeps the calllog line == tool name so the
                        # honest-detection match below works unchanged.
                        f.write(
                            "@echo off\n"
                            f'echo %~n0>>"{calllog}"\n'
                            "echo (search results: 3 relevant notes found; use them to answer)\n"
                        )
                else:
                    shim = os.path.join(work, tname)
                    with open(shim, "w") as f:
                        f.write(
                            "#!/usr/bin/env bash\n"
                            f'echo "{tname}" >> "{calllog}"\n'
                            'echo "(search results: 3 relevant notes found; use them to answer)"\n'
                        )
                    os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
            if is_windows:
                tool_hint = (
                    "You have shell tools available in the current directory: "
                    + ", ".join(f"{t}.cmd" for t in tool_names)
                    + " (each callable as `" + example + "` or `.\\"
                    + example + "`). When the skill says to look something "
                    "up or search before answering, you MUST actually run the "
                    "tool (e.g. `" + example + " \"query\"`) before giving "
                    "your final answer."
                )
            else:
                # Name a tool that really exists in the work dir; a fixed
                # `./search` example is wrong when other tools are supplied.
                tool_hint = (
                    "You have shell tools available in the current directory: "
                    + ", ".join(f"./{t}" for t in tool_names)
                    + ". When the skill says to look something up or search before "
                    "answering, you MUST actually run the tool (e.g. `./" + example
                    + " \"query\"`) "
                    "before giving your final answer."
                )
            prompt = (
                "You are completing a task. Apply the skill and memory rules EXACTLY, "
                "including any rule about searching/looking up before answering. "
                "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
                "earlier conflicting skill text.\n\n"
                f"{tool_hint}\n\n"
                f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
                "Return ONLY the final answer text."
            )
            resp = ""
            try:
                proc = subprocess.run(
                    self._build_cmd(prompt, work),
                    capture_output=True, text=True, encoding="utf-8",
                    errors="replace", timeout=self.timeout, cwd=work,
                    env=self._child_env(),
                )
                resp = self._parse_jsonl_response(proc.stdout or "")
            except Exception:  # noqa: BLE001 - deliberate best effort: no answer
                resp = ""
            # Rough token estimate (~4 characters per token).
            self._tokens += len(prompt) // 4 + len(resp) // 4
            called: List[str] = []
            if os.path.exists(calllog):
                # The log is written by the shell, so never fail on odd bytes.
                with open(calllog, errors="replace") as f:
                    logged = {ln.strip() for ln in f if ln.strip()}
                called = [t for t in tool_names if t in logged]
            return resp, called
        finally:
            shutil.rmtree(work, ignore_errors=True)
@@ -753,6 +966,8 @@ def get_backend( return ClaudeCliBackend(model=model, claude_path=claude_path) if n in {"codex", "codex_cli", "openai_codex"}: return CodexCliBackend(model=model, codex_path=codex_path) + if n in {"copilot", "github_copilot", "copilot_cli", "gh_copilot"}: + return CopilotCliBackend(model=model) return MockBackend() diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py index 75415273..98036411 100644 --- a/skillopt_sleep/config.py +++ b/skillopt_sleep/config.py @@ -34,7 +34,7 @@ "val_fraction": 0.34, # real tasks reserved to gate updates "test_fraction": 0.0, # real tasks reserved as the final held-out measure # ── optimizer ────────────────────────────────────────────────────────── - "backend": "mock", # "mock" | "claude" | "codex" + "backend": "mock", # "mock" | "claude" | "codex" | "copilot" "model": "", # backend-specific; "" => backend default "gate_mode": "on", # "on" (validation-gated) | "off" (greedy, no hard filter) "codex_path": "", # "" => auto-detect the real @openai/codex binary diff --git a/skillopt_sleep/experiments/run_experiment.py b/skillopt_sleep/experiments/run_experiment.py index 91a9ca99..1110f260 100644 --- a/skillopt_sleep/experiments/run_experiment.py +++ b/skillopt_sleep/experiments/run_experiment.py @@ -134,7 +134,7 @@ def main(argv=None) -> int: ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment") ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys())) ap.add_argument("--nights", type=int, default=4) - ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex"]) + ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex", "copilot"]) ap.add_argument("--model", default="", help="backend model override") ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary") ap.add_argument("--edit-budget", type=int, default=4) diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py 
class TestCopilotBackend(unittest.TestCase):
    """Pure-logic tests for CopilotCliBackend — no `copilot` CLI required."""

    # Every environment variable the backend consults. They are scrubbed for
    # each test so results do not depend on the developer's shell.
    _COPILOT_ENV_VARS = (
        "SKILLOPT_SLEEP_COPILOT_FULL_ENV",
        "SKILLOPT_SLEEP_COPILOT_HOME",
        "SKILLOPT_SLEEP_COPILOT_MODEL",
        "SKILLOPT_SLEEP_COPILOT_PATH",
    )

    def setUp(self):
        from unittest import mock

        # patch.dict snapshots os.environ and restores it on stop(), so tests
        # may set variables directly without hand-written save/restore code.
        patcher = mock.patch.dict(os.environ)
        patcher.start()
        self.addCleanup(patcher.stop)
        for var in self._COPILOT_ENV_VARS:
            os.environ.pop(var, None)

    def test_alias_resolution(self):
        from skillopt_sleep.backend import CopilotCliBackend, get_backend
        for name in ("copilot", "github_copilot", "copilot_cli", "gh_copilot"):
            self.assertIsInstance(get_backend(name), CopilotCliBackend, name)

    def test_parse_jsonl_concatenates_assistant_messages(self):
        from skillopt_sleep.backend import CopilotCliBackend
        raw = "\n".join([
            '{"type":"session.info","data":{}}',
            '{"type":"assistant.message","data":{"content":"hello"}}',
            'not-json-noise',
            '{"type":"user.message","data":{"content":"ignored"}}',
            '{"type":"assistant.message","data":{"content":"world"}}',
        ])
        self.assertEqual(CopilotCliBackend._parse_jsonl_response(raw), "hello\nworld")

    def test_parse_jsonl_ignores_non_assistant_and_blank(self):
        from skillopt_sleep.backend import CopilotCliBackend
        self.assertEqual(CopilotCliBackend._parse_jsonl_response(""), "")
        self.assertEqual(
            CopilotCliBackend._parse_jsonl_response('{"type":"result","data":{"content":"x"}}'),
            "",
        )
        # assistant.message with empty/missing content contributes nothing
        self.assertEqual(
            CopilotCliBackend._parse_jsonl_response(
                '{"type":"assistant.message","data":{"content":""}}\n'
                '{"type":"assistant.message","data":{}}'
            ),
            "",
        )

    def test_isolated_home_by_default(self):
        from skillopt_sleep.backend import CopilotCliBackend
        be = CopilotCliBackend()
        self.assertFalse(be.full_env)
        self.assertTrue(be.copilot_home)  # an isolated COPILOT_HOME is set

    def test_full_env_opt_out(self):
        from skillopt_sleep.backend import CopilotCliBackend
        os.environ["SKILLOPT_SLEEP_COPILOT_FULL_ENV"] = "1"
        be = CopilotCliBackend()
        self.assertTrue(be.full_env)
        self.assertEqual(be.copilot_home, "")  # real user environment used

    def test_home_override_env(self):
        from skillopt_sleep.backend import CopilotCliBackend
        with tempfile.TemporaryDirectory() as d:
            target = os.path.join(d, "myhome")
            os.environ["SKILLOPT_SLEEP_COPILOT_HOME"] = target
            be = CopilotCliBackend()
            self.assertEqual(be.copilot_home, target)
            self.assertTrue(os.path.isdir(target))  # created on init

    def test_attempt_with_tools_honest_detection(self):
        # End-to-end (no real CLI): a tiny per-OS stub stands in for `copilot`.
        # It runs the local `search` shim the backend writes into its work dir
        # (so the calllog is written — honest detection) then prints one JSONL
        # assistant.message. Proves both the JSONL parse and that the tool call
        # is detected from the shim's log, not from a self-reported marker.
        import shutil
        import stat

        from skillopt_sleep.backend import CopilotCliBackend

        stub_dir = tempfile.mkdtemp(prefix="skillopt_sleep_stub_")
        self.addCleanup(shutil.rmtree, stub_dir, ignore_errors=True)

        if os.name == "nt":
            stub = os.path.join(stub_dir, "copilot.cmd")
            with open(stub, "w") as f:
                # The backend writes `search.cmd`; run it (explicit `.\` so
                # cmd's `call` resolves it from the cwd reliably) so the
                # calllog is populated, then emit the JSONL line. None of
                # `{ } " :` need escaping in batch echo (no > < | & ^ %).
                f.write(
                    "@echo off\n"
                    'call .\\search.cmd "q" >nul 2>&1\n'
                    'echo {"type":"assistant.message","data":{"content":"Paris"}}\n'
                )
        else:
            stub = os.path.join(stub_dir, "copilot")
            with open(stub, "w") as f:
                f.write(
                    "#!/usr/bin/env bash\n"
                    './search "q" >/dev/null 2>&1\n'
                    "echo '{\"type\":\"assistant.message\",\"data\":{\"content\":\"Paris\"}}'\n"
                )
            os.chmod(
                stub,
                os.stat(stub).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH,
            )

        be = CopilotCliBackend(copilot_path=stub, timeout=60)
        task = TaskRecord(id="t1", project="p", intent="What is the capital of France?")
        resp, called = be.attempt_with_tools(task, skill="", memory="", tools=["search"])

        self.assertEqual(resp, "Paris")  # JSONL parsed via _parse_jsonl_response
        self.assertEqual(called, ["search"])  # shim ran; detected from calllog


if __name__ == "__main__":
    unittest.main(verbosity=2)