diff --git a/plugins/copilot/README.md b/plugins/copilot/README.md
index c5a32c78..c51625f8 100644
--- a/plugins/copilot/README.md
+++ b/plugins/copilot/README.md
@@ -45,8 +45,17 @@ Ask Copilot things like *"run the sleep cycle"*, *"what did the last sleep
propose?"*, *"adopt the staged sleep proposal"*. Copilot calls the MCP tools:
`sleep_status`, `sleep_dry_run`, `sleep_run`, `sleep_adopt`, `sleep_harvest`.
-Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`), and
-`scope` arguments. Default backend is `mock` (no API spend).
+Each tool takes optional `project`, `backend` (`mock`/`claude`/`codex`/`copilot`), and
+`scope` arguments. Default backend is `mock` (no API spend). The `copilot`
+backend drives the GitHub Copilot CLI (`copilot -p ... --output-format json`)
+and requires the `copilot` CLI to be installed and authenticated.
+
+For speed, the `copilot` backend runs each call against an isolated
+`COPILOT_HOME` with built-in MCP servers and custom instructions disabled, so
+your user MCP servers (including this project's own) are not spawned per call
+(~5x faster). Override with `SKILLOPT_SLEEP_COPILOT_HOME=<dir>`, pick a model
+with `SKILLOPT_SLEEP_COPILOT_MODEL`, or set `SKILLOPT_SLEEP_COPILOT_FULL_ENV=1`
+to use your real Copilot environment instead.
## Verify the server directly (no Copilot needed)
diff --git a/plugins/copilot/mcp_server.py b/plugins/copilot/mcp_server.py
index d03a95b6..2c592aea 100755
--- a/plugins/copilot/mcp_server.py
+++ b/plugins/copilot/mcp_server.py
@@ -45,8 +45,8 @@
"type": "object",
"properties": {
"project": {"type": "string", "description": "Project dir to evolve (default: cwd)."},
- "backend": {"type": "string", "enum": ["mock", "claude", "codex"],
- "description": "mock = no API spend (default); claude/codex = real."},
+ "backend": {"type": "string", "enum": ["mock", "claude", "codex", "copilot"],
+ "description": "mock = no API spend (default); claude/codex/copilot = real."},
"scope": {"type": "string", "enum": ["invoked", "all"]},
},
"additionalProperties": False,
diff --git a/plugins/copilot/skillopt/README.md b/plugins/copilot/skillopt/README.md
new file mode 100644
index 00000000..c4910a23
--- /dev/null
+++ b/plugins/copilot/skillopt/README.md
@@ -0,0 +1,98 @@
+# SkillOpt — GitHub Copilot integration
+
+Give **Copilot** (CLI or VS Code) direct access to the **SkillOpt** research
+engine via a tiny **MCP server**. MCP is GitHub's supported way to extend
+Copilot, so this works across Copilot CLI, VS Code, and other MCP clients with
+the same server.
+
+SkillOpt is **validation-gated, text-space skill optimization**: it reflects on
+rollouts, makes bounded edits to a skill, and keeps a change only if it improves
+a held-out validation set. This plugin exposes the repo's training and eval
+entry points (`scripts/train.py`, `scripts/eval_only.py`) as Copilot tools.
+
+> This is the companion to the **SkillOpt-Sleep** plugin (`../mcp_server.py`,
+> `sleep_*` tools). Sleep evolves a *local coding agent* from your past
+> sessions; this server drives the *research* training/eval loops on the
+> benchmark configs in [`../../../configs`](../../../configs).
+
+## What's here
+
+| File | Purpose |
+|---|---|
+| `mcp_server.py` | stdlib-only MCP (stdio) server exposing `skillopt_*` tools |
+| `mcp-config.example.json` | drop-in MCP server config |
+| `copilot-instructions.snippet.md` | paste into `.github/copilot-instructions.md` |
+
+## Install
+
+Requires Python ≥ 3.10. The MCP server itself is pure stdlib, but the tools it
+launches need SkillOpt's runtime deps — install the package first:
+
+```bash
+pip install -e . # or: pip install -r requirements.txt
+```
+
+1. **Register the MCP server.** Add the server to your Copilot MCP config
+ (Copilot CLI: `~/.copilot/mcp-config.json`; VS Code: your MCP settings).
+ Use `mcp-config.example.json` as a template — set `SKILLOPT_REPO` to this
+ repo's path:
+
+ ```json
+ {
+ "mcpServers": {
+ "skillopt": {
+ "command": "python3",
+ "args": ["/abs/path/SkillOpt/plugins/copilot/skillopt/mcp_server.py"],
+ "env": { "SKILLOPT_REPO": "/abs/path/SkillOpt" }
+ }
+ }
+ }
+ ```
+
+2. **(Optional) Tell Copilot about it.** Append
+ `copilot-instructions.snippet.md` to your repo's
+ `.github/copilot-instructions.md` so Copilot reaches for the tools when the
+ user asks to "optimize a skill" or "train on a benchmark".
+
+## Use
+
+Ask Copilot things like *"what configs can I run?"*, *"optimize the searchqa
+skill"*, or *"evaluate this skill on the dataset"*. Copilot calls the MCP tools:
+`skillopt_list_configs`, `skillopt_train`, `skillopt_eval`.
+
+| Tool | Required args | Notes |
+|---|---|---|
+| `skillopt_list_configs` | — | Lists `configs/**/*.yaml` you can pass as `config`. |
+| `skillopt_train` | `config` | Runs a reflective optimization loop. Long-running; spends budget. |
+| `skillopt_eval` | `config`, `skill` | Evaluates one skill markdown file; no training. |
+
+Common optional args (both train and eval): `env`, `backend`,
+`optimizer_model`, `target_model`, `out_root`, `cfg_options` (space-separated
+`KEY=VALUE` YAML overrides), and `extra_args` (raw passthrough flags for the
+underlying script). `skillopt_train` also accepts `num_epochs`, `batch_size`,
+`seed`, and `use_gate`; `skillopt_eval` also accepts `split`.
+
+Runs can be very long. The server's subprocess timeout defaults to 6 hours;
+override it with the `SKILLOPT_RUN_TIMEOUT` environment variable (seconds).
+
+## Verify the server directly (no Copilot needed)
+
+```bash
+printf '%s\n' \
+ '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' \
+ '{"jsonrpc":"2.0","id":2,"method":"tools/list"}' \
+ '{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"skillopt_list_configs","arguments":{}}}' \
+ | SKILLOPT_REPO="$(pwd)" python3 plugins/copilot/skillopt/mcp_server.py
+```
+
+You should see the server info, the three `skillopt_*` tools, and the list of
+benchmark configs.
+
+## Notes / status
+
+- MCP is the stable, official Copilot extension surface, so this is portable
+ across Copilot CLI and IDE from one server.
+- `skillopt_list_configs` is filesystem-only and safe to call anytime;
+ `skillopt_train` / `skillopt_eval` shell out to the repo scripts and require
+ the SkillOpt runtime deps (and, for real backends, model credentials — see
+ [`../../../.env.example`](../../../.env.example)).
diff --git a/plugins/copilot/skillopt/copilot-instructions.snippet.md b/plugins/copilot/skillopt/copilot-instructions.snippet.md
new file mode 100644
index 00000000..b53c4a5d
--- /dev/null
+++ b/plugins/copilot/skillopt/copilot-instructions.snippet.md
@@ -0,0 +1,33 @@
+
+
+## SkillOpt (research skill-optimization engine)
+
+This repo exposes the core **SkillOpt** training/eval engine via an MCP server
+(`skillopt`). SkillOpt is validation-gated, text-space skill optimization: it
+reflects on rollouts, makes bounded edits to a skill, and keeps a change only
+if it improves a held-out validation set.
+
+When the user asks to "optimize a skill", "train on <benchmark>", "run
+SkillOpt", "evaluate this skill", or "what configs can I run", use the MCP
+tools:
+
+- `skillopt_list_configs` — list the benchmark YAML configs you can pass as `config`
+- `skillopt_train` — run a reflective skill-optimization loop on a config (long-running; spends API/compute budget)
+- `skillopt_eval` — evaluate a single skill markdown file on a dataset (no training)
+
+Guidance:
+- Always run `skillopt_list_configs` first if you don't already know a valid `config` path.
+- `skillopt_train` and `skillopt_eval` are long-running and consume the user's
+ model backend/budget — confirm the `config`, `backend`, and model choices
+ with the user before launching, and surface the held-out gate result when the
+ run finishes.
+- For one-off YAML overrides use `cfg_options` (e.g. `seed=123 batch_size=40`);
+ for any other underlying flag use `extra_args`.
+
+This is distinct from the **SkillOpt-Sleep** MCP server (`skillopt-sleep`,
+`sleep_*` tools), which evolves a local coding agent from past sessions rather
+than running the research benchmarks.
diff --git a/plugins/copilot/skillopt/mcp-config.example.json b/plugins/copilot/skillopt/mcp-config.example.json
new file mode 100644
index 00000000..eb2aba55
--- /dev/null
+++ b/plugins/copilot/skillopt/mcp-config.example.json
@@ -0,0 +1,11 @@
+{
+ "mcpServers": {
+ "skillopt": {
+ "command": "python3",
+ "args": ["plugins/copilot/skillopt/mcp_server.py"],
+ "env": {
+ "SKILLOPT_REPO": "${workspaceFolder}"
+ }
+ }
+ }
+}
diff --git a/plugins/copilot/skillopt/mcp_server.py b/plugins/copilot/skillopt/mcp_server.py
new file mode 100644
index 00000000..853877fd
--- /dev/null
+++ b/plugins/copilot/skillopt/mcp_server.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+"""SkillOpt (research engine) — minimal MCP server (stdio, stdlib-only).
+
+Exposes the core SkillOpt skill-optimization engine as MCP tools so any
+MCP-capable client (GitHub Copilot CLI / VS Code, Claude Desktop, etc.) can
+drive it. No third-party deps: speaks JSON-RPC 2.0 over stdio with just the
+handful of MCP methods clients need.
+
+This is the companion to the SkillOpt-Sleep MCP server (``../mcp_server.py``).
+Where Sleep evolves a *local agent* from past sessions, this server drives the
+*research* training/eval loops from this repo (``scripts/train.py`` /
+``scripts/eval_only.py``) against the benchmark configs in ``configs/``.
+
+Tools exposed:
+ - skillopt_list_configs : discover the benchmark YAML configs you can use
+ - skillopt_train : run a reflective skill-optimization (training) loop
+ - skillopt_eval : evaluate a single skill on a dataset (no training)
+
+``skillopt_train`` and ``skillopt_eval`` shell out to the repo's entry-point
+scripts and return their captured stdout/stderr once the run has finished.
+Configure your client to launch:
+ python plugins/copilot/skillopt/mcp_server.py
+"""
+from __future__ import annotations
+
+import glob
+import json
+import os
+import subprocess
+import sys
+
+# Repo root: three levels up from plugins/copilot/skillopt/mcp_server.py,
+# unless the SKILLOPT_REPO environment variable names another checkout.
+REPO_ROOT = os.environ.get("SKILLOPT_REPO") or os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "..", "..")
+)
+# MCP protocol revision reported in the `initialize` response.
+PROTOCOL_VERSION = "2024-11-05"
+
+# Training/eval runs are long; give the engine plenty of headroom (6h default).
+_DEFAULT_RUN_TIMEOUT_SECONDS = 21600
+try:
+    # An empty SKILLOPT_RUN_TIMEOUT is treated the same as an unset one.
+    RUN_TIMEOUT_SECONDS = int(
+        os.environ.get("SKILLOPT_RUN_TIMEOUT") or _DEFAULT_RUN_TIMEOUT_SECONDS
+    )
+except ValueError:
+    # A typo in the env var must not stop the server from starting. Warn on
+    # stderr only: stdout is reserved for JSON-RPC traffic.
+    sys.stderr.write(
+        "[skillopt-mcp] ignoring invalid SKILLOPT_RUN_TIMEOUT; "
+        f"using {_DEFAULT_RUN_TIMEOUT_SECONDS}s\n"
+    )
+    RUN_TIMEOUT_SECONDS = _DEFAULT_RUN_TIMEOUT_SECONDS
+
+
+def _list_configs() -> str:
+    """Return a bullet list of every ``*.yaml`` file below ``<repo>/configs``.
+
+    Only the filesystem is touched. Paths are reported relative to the repo
+    root with forward slashes; when nothing matches, a bracketed notice naming
+    the searched directory is returned instead.
+    """
+    configs_dir = os.path.join(REPO_ROOT, "configs")
+    found = glob.glob(os.path.join(configs_dir, "**", "*.yaml"), recursive=True)
+    if not found:
+        return f"[no configs found under {configs_dir}]"
+    found.sort()
+    report = ["Available SkillOpt configs (pass as `config`):", ""]
+    for path in found:
+        rel = os.path.relpath(path, REPO_ROOT).replace(os.sep, "/")
+        report.append(f" - {rel}")
+    return "\n".join(report)
+
+
+def _run_script(script_rel: str, args: dict, *, required: tuple[str, ...] = ()) -> str:
+    """Run ``scripts/<script_rel>`` from the repo root and return its output.
+
+    Args:
+        script_rel: Script file name relative to ``scripts/`` (e.g. ``train.py``).
+        args: Tool-call arguments. Recognised keys become ``--key value`` flags;
+            ``cfg_options`` and ``extra_args`` are whitespace-split when given
+            as strings.
+        required: Keys that must be present and truthy in ``args``.
+
+    Returns:
+        The script's stdout, followed by a ``[stderr]`` section when stderr is
+        non-empty. A non-zero exit status is reported on a leading ``[error]``
+        line so a failed run cannot be mistaken for a successful one. Missing
+        arguments, timeouts and launch failures are likewise returned as
+        ``[error] ...`` strings instead of being raised.
+    """
+    for key in required:
+        if not args.get(key):
+            return f"[error] missing required argument: {key}"
+
+    py = sys.executable or "python3"
+    cmd = [py, os.path.join("scripts", script_rel)]
+
+    # Ordered flags that the train/eval scripts accept directly.
+    flag_args = (
+        "config", "skill", "split", "env", "backend",
+        "optimizer_model", "target_model", "out_root",
+        "num_epochs", "batch_size", "seed", "use_gate",
+    )
+    for key in flag_args:
+        val = args.get(key)
+        if val is None or val == "":
+            continue
+        cmd += [f"--{key}", str(val)]
+
+    # cfg-options: arbitrary KEY=VALUE YAML overrides (nargs="+").
+    cfg_options = args.get("cfg_options")
+    if cfg_options:
+        if isinstance(cfg_options, str):
+            cfg_options = cfg_options.split()
+        cmd += ["--cfg-options", *[str(x) for x in cfg_options]]
+
+    # extra_args: raw passthrough for any other train/eval flag.
+    extra = args.get("extra_args")
+    if extra:
+        if isinstance(extra, str):
+            extra = extra.split()
+        cmd += [str(x) for x in extra]
+
+    try:
+        proc = subprocess.run(
+            cmd, cwd=REPO_ROOT, capture_output=True, text=True,
+            timeout=RUN_TIMEOUT_SECONDS,
+        )
+    except subprocess.TimeoutExpired:
+        return f"[error] run exceeded {RUN_TIMEOUT_SECONDS}s timeout: {' '.join(cmd)}"
+    except Exception as e:  # noqa: BLE001
+        return f"[error] failed to run script: {e}"
+    out = (proc.stdout or "").strip()
+    err = (proc.stderr or "").strip()
+    body = out + (("\n[stderr]\n" + err) if err else "")
+    if proc.returncode != 0:
+        # Surface the failure explicitly: many scripts print progress before
+        # crashing, and the output alone does not reveal the exit status.
+        status = f"[error] script exited with code {proc.returncode}"
+        return f"{status}\n{body}" if body else status
+    return body or f"[done] exit code {proc.returncode}, no output"
+
+
+# Tool catalogue advertised through `tools/list`: one entry per MCP tool with
+# its public name and the human-readable description shown to the client.
+TOOLS = [
+ {
+ "name": "skillopt_list_configs",
+ "description": "List the benchmark YAML configs under configs/ that can be passed as `config` to train/eval.",
+ },
+ {
+ "name": "skillopt_train",
+ "description": "Run a SkillOpt reflective skill-optimization (training) loop on a benchmark config. Long-running; uses your model backend/budget.",
+ },
+ {
+ "name": "skillopt_eval",
+ "description": "Evaluate a single skill markdown file on a dataset without training (scripts/eval_only.py).",
+ },
+]
+# Name -> tool entry index; `tools/call` uses it to reject unknown tool names.
+_BY_NAME = {t["name"]: t for t in TOOLS}
+
+# JSON Schema for tools that take no arguments at all.
+_NO_ARGS_SCHEMA = {"type": "object", "properties": {}, "additionalProperties": False}
+
+# JSON Schema properties shared by the train and eval tools. Each key is also
+# an argument name that `_run_script` forwards to the underlying script.
+_COMMON_PROPS = {
+ "config": {"type": "string",
+ "description": "Path to a benchmark YAML config (e.g. configs/searchqa/default.yaml). See skillopt_list_configs."},
+ "env": {"type": "string", "description": "Override the environment/adapter name (e.g. searchqa, alfworld)."},
+ "backend": {"type": "string", "description": "Model backend (e.g. azure_openai, claude, codex, qwen, minimax)."},
+ "optimizer_model": {"type": "string", "description": "Model used for reflection/skill rewriting (the optimizer)."},
+ "target_model": {"type": "string", "description": "Model used to execute tasks (the target)."},
+ "out_root": {"type": "string", "description": "Output directory root for run artifacts."},
+ "cfg_options": {"type": "string", "description": "Space-separated YAML overrides, e.g. 'seed=123 batch_size=40'."},
+ "extra_args": {"type": "string", "description": "Raw passthrough flags for the underlying script, e.g. '--workers 8 --max_turns 30'."},
+}
+
+# Input schema for `skillopt_train`: the common properties plus the
+# training-only knobs. Only `config` is mandatory.
+_TRAIN_SCHEMA = {
+ "type": "object",
+ "properties": {
+ **_COMMON_PROPS,
+ "num_epochs": {"type": "integer", "description": "Number of optimization epochs."},
+ "batch_size": {"type": "integer", "description": "Tasks per optimization step."},
+ "seed": {"type": "integer", "description": "Random seed."},
+ "use_gate": {"type": "string", "enum": ["true", "false"],
+ "description": "Whether to keep the held-out validation gate on (default on)."},
+ },
+ "required": ["config"],
+ "additionalProperties": False,
+}
+
+# Input schema for `skillopt_eval`: the common properties plus the skill file
+# and dataset split. Both `config` and `skill` are mandatory.
+_EVAL_SCHEMA = {
+ "type": "object",
+ "properties": {
+ **_COMMON_PROPS,
+ "skill": {"type": "string", "description": "Path to the skill markdown file to evaluate."},
+ "split": {"type": "string", "description": "Dataset split to evaluate (default: all)."},
+ },
+ "required": ["config", "skill"],
+ "additionalProperties": False,
+}
+
+# Tool name -> input schema, attached as `inputSchema` in the `tools/list` reply.
+_SCHEMA_BY_NAME = {
+ "skillopt_list_configs": _NO_ARGS_SCHEMA,
+ "skillopt_train": _TRAIN_SCHEMA,
+ "skillopt_eval": _EVAL_SCHEMA,
+}
+
+
+def _result(id_, result):
+    """Build the JSON-RPC 2.0 success envelope answering request ``id_``."""
+    envelope = {"jsonrpc": "2.0", "id": id_}
+    envelope["result"] = result
+    return envelope
+
+
+def _error(id_, code, message):
+    """Build the JSON-RPC 2.0 error envelope answering request ``id_``."""
+    failure = dict(code=code, message=message)
+    envelope = {"jsonrpc": "2.0", "id": id_}
+    envelope["error"] = failure
+    return envelope
+
+
+def _dispatch(name: str, args: dict) -> str:
+    """Run the tool called ``name`` with ``args`` and return its text output.
+
+    ``skillopt_list_configs`` is answered from the filesystem; the train and
+    eval tools are forwarded to their repo scripts together with the argument
+    names each one requires. Any other name yields an ``[error]`` string.
+    """
+    if name == "skillopt_list_configs":
+        return _list_configs()
+    # (tool name, script under scripts/, required argument names)
+    script_tools = (
+        ("skillopt_train", "train.py", ("config",)),
+        ("skillopt_eval", "eval_only.py", ("config", "skill")),
+    )
+    for tool, script, needed in script_tools:
+        if name == tool:
+            return _run_script(script, args, required=needed)
+    return f"[error] unknown tool: {name}"
+
+
+def handle(req: dict):
+    """Answer one decoded JSON-RPC message.
+
+    Args:
+        req: The decoded request or notification object.
+
+    Returns:
+        A JSON-RPC response dict, or ``None`` when no reply may be sent.
+        Messages without an ``id`` member are notifications and are never
+        answered, not even with an error. Unknown methods get ``-32601``;
+        an unknown tool or malformed tool arguments get ``-32602``.
+    """
+    # JSON-RPC 2.0: no "id" member means notification, and the server MUST NOT
+    # reply. This covers `notifications/initialized` as well as notifications
+    # this server does not act on (e.g. `notifications/cancelled`).
+    if "id" not in req:
+        return None
+    id_ = req["id"]
+    method = req.get("method")
+    if method == "initialize":
+        return _result(id_, {
+            "protocolVersion": PROTOCOL_VERSION,
+            "capabilities": {"tools": {}},
+            "serverInfo": {"name": "skillopt", "version": "0.1.0"},
+        })
+    if method in ("notifications/initialized", "initialized"):
+        return None  # stays silent even if a client wrongly attaches an id
+    if method == "tools/list":
+        return _result(id_, {"tools": [
+            {"name": t["name"], "description": t["description"],
+             "inputSchema": _SCHEMA_BY_NAME[t["name"]]}
+            for t in TOOLS
+        ]})
+    if method == "tools/call":
+        params = req.get("params")
+        if not isinstance(params, dict):
+            params = {}
+        name = params.get("name")
+        # The isinstance check also keeps unhashable names out of the dict lookup.
+        if not isinstance(name, str) or name not in _BY_NAME:
+            return _error(id_, -32602, f"unknown tool: {name}")
+        arguments = params.get("arguments")
+        if arguments is None:
+            arguments = {}
+        elif not isinstance(arguments, dict):
+            return _error(id_, -32602, "tool arguments must be a JSON object")
+        text = _dispatch(name, arguments)
+        return _result(id_, {"content": [{"type": "text", "text": text}]})
+    if method == "ping":
+        return _result(id_, {})
+    return _error(id_, -32601, f"method not found: {method}")
+
+
+def main() -> int:
+    """Serve newline-delimited JSON-RPC messages from stdin until EOF.
+
+    Each non-blank line is decoded and passed to ``handle``; any response is
+    written to stdout as one JSON line and flushed immediately. Lines that are
+    not valid JSON, or whose top-level value is not an object, are skipped. An
+    unexpected exception while handling a request is reported to the caller
+    as a ``-32603`` error instead of terminating the server.
+
+    Returns:
+        ``0`` once stdin is exhausted.
+    """
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            req = json.loads(line)
+        except Exception:  # noqa: BLE001 - best-effort: drop undecodable input
+            continue
+        if not isinstance(req, dict):
+            # Batches and bare scalars are valid JSON but not messages this
+            # server understands; skipping keeps the loop alive.
+            continue
+        try:
+            resp = handle(req)
+        except Exception as e:  # noqa: BLE001 - one bad call must not end the session
+            if "id" not in req:
+                continue  # notifications never get a reply, even on failure
+            resp = _error(req["id"], -32603, f"internal error: {e}")
+        if resp is not None:
+            sys.stdout.write(json.dumps(resp) + "\n")
+            sys.stdout.flush()
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skillopt_sleep/__main__.py b/skillopt_sleep/__main__.py
index f2efa3e9..440bf85d 100644
--- a/skillopt_sleep/__main__.py
+++ b/skillopt_sleep/__main__.py
@@ -9,7 +9,7 @@
Common flags:
--project PATH project to evolve (default: cwd)
--scope all|invoked harvest scope (default: invoked)
- --backend mock|anthropic
+ --backend mock|claude|codex|copilot
--model NAME
--lookback-hours N
--auto-adopt
@@ -34,7 +34,7 @@
def _add_common(p: argparse.ArgumentParser) -> None:
p.add_argument("--project", default="")
p.add_argument("--scope", default="", choices=["", "all", "invoked"])
- p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex"])
+ p.add_argument("--backend", default="", choices=["", "mock", "claude", "codex", "copilot"])
p.add_argument("--model", default="")
p.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
p.add_argument("--claude-home", default="", help="override ~/.claude (also isolates state)")
diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py
index fbc8d269..6a22ca4d 100644
--- a/skillopt_sleep/backend.py
+++ b/skillopt_sleep/backend.py
@@ -24,6 +24,7 @@
import os
import re
import subprocess
+import tempfile
from typing import Any, Dict, List, Optional, Tuple
from skillopt_sleep.types import EditRecord, ReplayResult, TaskRecord
@@ -698,6 +699,218 @@ def attempt_with_tools(self, task, skill, memory, tools):
except Exception:
pass
+def resolve_copilot_path(explicit: str = "") -> str:
+    """Locate the GitHub Copilot CLI (`copilot`) executable.
+
+    Resolution order: the ``explicit`` argument, then the
+    ``SKILLOPT_SLEEP_COPILOT_PATH`` environment variable, then a ``PATH``
+    lookup. When all three come up empty the bare command name ``copilot`` is
+    returned.
+    """
+    override = explicit or os.environ.get("SKILLOPT_SLEEP_COPILOT_PATH")
+    if override:
+        return override
+    import shutil
+    located = shutil.which("copilot")
+    return located if located else "copilot"
+
+
+class CopilotCliBackend(CliBackend):
+    """Drives the GitHub Copilot CLI in non-interactive mode.
+
+    Uses ``copilot -p <prompt> --output-format json`` and parses the emitted
+    JSONL event stream, returning the concatenated ``assistant.message``
+    content. The plain-text / ``--silent`` modes do not reliably stream the
+    response to stdout on all platforms, so JSONL is used for robust capture.
+
+    The call runs in a clean temp cwd with streaming disabled and tools allowed
+    (so non-interactive mode never blocks on a permission prompt); ``_call``'s
+    prompts ask for final-answer text only, so no tool use is expected there,
+    while ``attempt_with_tools`` exposes real, cross-platform callable shims in
+    the working directory for honest tool-call detection.
+
+    Startup overhead is minimised: each invocation points ``COPILOT_HOME`` at a
+    dedicated, isolated config dir (no user ``mcp-config.json``, so the user's
+    MCP servers — including this project's own — are NOT spawned, avoiding a
+    slow recursive launch), and built-in MCP servers / custom instructions are
+    disabled. Auth is read from the OS credential store / token env vars, which
+    live outside ``COPILOT_HOME``, so isolation does not break authentication.
+    Set ``SKILLOPT_SLEEP_COPILOT_HOME`` to a directory to override the isolated
+    home (an unset or empty value selects the default isolated home under the
+    system temp dir), or set ``SKILLOPT_SLEEP_COPILOT_FULL_ENV=1`` to use the
+    user's real environment instead.
+    """
+
+    name = "copilot"
+
+    def __init__(self, model: str = "", copilot_path: str = "", timeout: int = 240) -> None:
+        """Resolve the CLI binary, the model and the (optional) isolated home.
+
+        Args:
+            model: Model name; falls back to ``SKILLOPT_SLEEP_COPILOT_MODEL``.
+            copilot_path: Explicit path to the ``copilot`` binary ("" => auto).
+            timeout: Per-invocation subprocess timeout in seconds.
+        """
+        super().__init__(model=model or os.environ.get("SKILLOPT_SLEEP_COPILOT_MODEL", ""),
+                         timeout=timeout)
+        self.copilot_path = resolve_copilot_path(copilot_path)
+        self.full_env = os.environ.get("SKILLOPT_SLEEP_COPILOT_FULL_ENV", "") == "1"
+        # Stable isolated home so first-run setup is cached across calls.
+        if self.full_env:
+            self.copilot_home = ""
+        else:
+            # NOTE(review): the default lives at a predictable path in the
+            # shared temp dir; on multi-user machines consider a per-user
+            # location so another account cannot pre-seed this config dir.
+            self.copilot_home = os.environ.get("SKILLOPT_SLEEP_COPILOT_HOME") or os.path.join(
+                tempfile.gettempdir(), "skillopt_sleep_copilot_home"
+            )
+            try:
+                os.makedirs(self.copilot_home, exist_ok=True)
+            except Exception:  # noqa: BLE001 - best-effort: fall back to the real home
+                self.copilot_home = ""
+
+    def _build_cmd(self, prompt: str, workdir: str) -> List[str]:
+        """Return the non-interactive ``copilot`` argv for ``prompt`` in ``workdir``."""
+        cmd = [
+            self.copilot_path, "-p", prompt,
+            "--output-format", "json",
+            "--stream", "off",
+            "--no-color",
+            "--log-level", "none",
+            "--allow-all-tools",
+            "-C", workdir,
+        ]
+        if not self.full_env:
+            # Drop unneeded startup work: no built-in (github) MCP server and no
+            # AGENTS.md / custom-instruction loading. With an isolated home that
+            # has no mcp-config.json, no user MCP servers spawn either.
+            cmd += ["--disable-builtin-mcps", "--no-custom-instructions"]
+        if self.model:
+            cmd += ["--model", self.model]
+        return cmd
+
+    def _run_cli(self, prompt: str, workdir: str) -> str:
+        """Run the CLI once in ``workdir``; return the parsed reply ("" on any failure)."""
+        env = os.environ.copy()
+        if self.copilot_home:
+            env["COPILOT_HOME"] = self.copilot_home
+        try:
+            proc = subprocess.run(
+                self._build_cmd(prompt, workdir), capture_output=True, text=True,
+                encoding="utf-8", errors="replace", timeout=self.timeout,
+                cwd=workdir, env=env,
+            )
+        except Exception:  # noqa: BLE001 - missing binary / timeout => no answer
+            return ""
+        return self._parse_jsonl_response(proc.stdout or "")
+
+    def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
+        """Send ``prompt`` to the CLI from a throwaway cwd and return its reply.
+
+        ``max_tokens`` is accepted for interface compatibility; it is not
+        forwarded to the CLI. Returns "" when the CLI cannot be run.
+        """
+        import shutil
+        clean_cwd = tempfile.mkdtemp(prefix="skillopt_sleep_copilot_")
+        try:
+            return self._run_cli(prompt, clean_cwd)
+        finally:
+            shutil.rmtree(clean_cwd, ignore_errors=True)
+
+    @staticmethod
+    def _parse_jsonl_response(raw: str) -> str:
+        """Join the ``content`` of every ``assistant.message`` event in ``raw``.
+
+        Lines that are blank, do not start with ``{``, or fail to decode are
+        skipped, as are events whose ``data`` is not an object or whose content
+        is missing or empty. Messages are joined with newlines and stripped.
+        """
+        parts: List[str] = []
+        for line in raw.splitlines():
+            line = line.strip()
+            if not line or not line.startswith("{"):
+                continue
+            try:
+                obj = json.loads(line)
+            except Exception:  # noqa: BLE001 - tolerate noise between events
+                continue
+            if not isinstance(obj, dict) or obj.get("type") != "assistant.message":
+                continue
+            data = obj.get("data")
+            # A malformed event must not abort the parse of the whole stream.
+            content = data.get("content") if isinstance(data, dict) else None
+            if isinstance(content, str) and content:
+                parts.append(content)
+        return "\n".join(parts).strip()
+
+    @staticmethod
+    def _write_tool_shims(work: str, calllog: str, tool_names, is_windows: bool) -> None:
+        """Create one callable shim per tool in ``work``; each run appends its name to ``calllog``."""
+        import stat
+        for tname in tool_names:
+            if is_windows:
+                shim = os.path.join(work, f"{tname}.cmd")
+                with open(shim, "w") as f:
+                    # `%~n0` is the script's own base name (the tool name);
+                    # writing it keeps the calllog line == tool name so the
+                    # honest-detection match in the caller works unchanged.
+                    f.write(
+                        "@echo off\n"
+                        f'echo %~n0>>"{calllog}"\n'
+                        "echo (search results: 3 relevant notes found; use them to answer)\n"
+                    )
+            else:
+                shim = os.path.join(work, tname)
+                with open(shim, "w") as f:
+                    f.write(
+                        "#!/usr/bin/env bash\n"
+                        f'echo "{tname}" >> "{calllog}"\n'
+                        'echo "(search results: 3 relevant notes found; use them to answer)"\n'
+                    )
+                os.chmod(shim, os.stat(shim).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+
+    @staticmethod
+    def _tool_hint(tool_names, is_windows: bool) -> str:
+        """Return the prompt paragraph telling the model how to invoke the shims."""
+        if is_windows:
+            return (
+                "You have shell tools available in the current directory: "
+                + ", ".join(f"{t}.cmd" for t in tool_names)
+                + " (each callable as `" + tool_names[0] + "` or `.\\"
+                + tool_names[0] + "`). When the skill says to look something "
+                "up or search before answering, you MUST actually run the "
+                "tool (e.g. `" + tool_names[0] + " \"query\"`) before giving "
+                "your final answer."
+            )
+        return (
+            "You have shell tools available in the current directory: "
+            + ", ".join(f"./{t}" for t in tool_names)
+            + ". When the skill says to look something up or search before "
+            "answering, you MUST actually run the tool (e.g. `./search \"query\"`) "
+            "before giving your final answer."
+        )
+
+    def attempt_with_tools(self, task, skill, memory, tools):
+        """Attempt ``task`` with real tool shims and report which ones were run.
+
+        Returns:
+            ``(response_text, called)`` where ``called`` lists, in ``tools``
+            order, the tool names found in the shims' call log.
+        """
+        # Expose REAL, callable tool shims in the working directory so the
+        # gbrain quick-answerer judge (tool_called=search) is validated
+        # honestly: we detect each call from the shim's log, not from a
+        # self-reported marker. The Copilot CLI is the Windows-validated
+        # backend, so the shims must be cross-platform — a bash `#!/usr/bin/env
+        # bash` + chmod shim does NOT execute via `./tool` under PowerShell/cmd,
+        # so on Windows we emit a `.cmd` batch shim instead.
+        import shutil
+        work = tempfile.mkdtemp(prefix="skillopt_sleep_copilottools_")
+        calllog = os.path.join(work, "_tool_calls.log")
+        tool_names = tools or ["search"]
+        is_windows = os.name == "nt"
+        try:
+            self._write_tool_shims(work, calllog, tool_names, is_windows)
+            tool_hint = self._tool_hint(tool_names, is_windows)
+            prompt = (
+                "You are completing a task. Apply the skill and memory rules EXACTLY, "
+                "including any rule about searching/looking up before answering. "
+                "Treat a 'Learned preferences' block as HARD CONSTRAINTS that override "
+                "earlier conflicting skill text.\n\n"
+                f"{tool_hint}\n\n"
+                f"# Skill\n{skill or '(none)'}\n\n# Memory\n{memory or '(none)'}\n\n"
+                f"# Task\n{task.intent}\n\n{task.context_excerpt}\n\n"
+                "Return ONLY the final answer text."
+            )
+            resp = self._run_cli(prompt, work)
+            # Rough token estimate (~4 characters per token) for prompt + reply.
+            self._tokens += len(prompt) // 4 + len(resp) // 4
+            called: List[str] = []
+            if os.path.exists(calllog):
+                with open(calllog) as f:
+                    logged = {ln.strip() for ln in f if ln.strip()}
+                called = [t for t in tool_names if t in logged]
+            return resp, called
+        finally:
+            shutil.rmtree(work, ignore_errors=True)
+
+
class DualBackend(Backend):
"""Route operations to two backends, à la SkillOpt's target vs optimizer.
@@ -753,6 +966,8 @@ def get_backend(
return ClaudeCliBackend(model=model, claude_path=claude_path)
if n in {"codex", "codex_cli", "openai_codex"}:
return CodexCliBackend(model=model, codex_path=codex_path)
+ if n in {"copilot", "github_copilot", "copilot_cli", "gh_copilot"}:
+ return CopilotCliBackend(model=model)
return MockBackend()
diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py
index 75415273..98036411 100644
--- a/skillopt_sleep/config.py
+++ b/skillopt_sleep/config.py
@@ -34,7 +34,7 @@
"val_fraction": 0.34, # real tasks reserved to gate updates
"test_fraction": 0.0, # real tasks reserved as the final held-out measure
# ── optimizer ──────────────────────────────────────────────────────────
- "backend": "mock", # "mock" | "claude" | "codex"
+ "backend": "mock", # "mock" | "claude" | "codex" | "copilot"
"model": "", # backend-specific; "" => backend default
"gate_mode": "on", # "on" (validation-gated) | "off" (greedy, no hard filter)
"codex_path": "", # "" => auto-detect the real @openai/codex binary
diff --git a/skillopt_sleep/experiments/run_experiment.py b/skillopt_sleep/experiments/run_experiment.py
index 91a9ca99..1110f260 100644
--- a/skillopt_sleep/experiments/run_experiment.py
+++ b/skillopt_sleep/experiments/run_experiment.py
@@ -134,7 +134,7 @@ def main(argv=None) -> int:
ap = argparse.ArgumentParser(description="SkillOpt-Sleep validation experiment")
ap.add_argument("--persona", default="researcher", choices=list(PERSONAS.keys()))
ap.add_argument("--nights", type=int, default=4)
- ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex"])
+ ap.add_argument("--backend", default="mock", choices=["mock", "claude", "codex", "copilot"])
ap.add_argument("--model", default="", help="backend model override")
ap.add_argument("--codex-path", default="", help="path to the real @openai/codex binary")
ap.add_argument("--edit-budget", type=int, default=4)
diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py
index 2a28dce3..8e283339 100644
--- a/tests/test_sleep_engine.py
+++ b/tests/test_sleep_engine.py
@@ -418,5 +418,124 @@ def test_cycle_stage_then_adopt_with_backup(self):
self.assertIn("answer", f.read().lower())
+class TestCopilotBackend(unittest.TestCase):
+    """Pure-logic tests for CopilotCliBackend — no `copilot` CLI required.
+
+    Every test that builds a backend runs inside ``_copilot_env`` so the
+    outcome does not depend on ``SKILLOPT_SLEEP_COPILOT_*`` variables that
+    happen to be set in the developer's shell.
+    """
+
+    @staticmethod
+    def _copilot_env(**overrides):
+        """Return a context manager giving a hermetic view of ``os.environ``.
+
+        The home / full-env switches are removed, ``overrides`` are applied on
+        top, and the original environment is restored on exit.
+        """
+        import contextlib
+        from unittest import mock
+
+        @contextlib.contextmanager
+        def _scoped():
+            with mock.patch.dict(os.environ):
+                for key in ("SKILLOPT_SLEEP_COPILOT_FULL_ENV", "SKILLOPT_SLEEP_COPILOT_HOME"):
+                    os.environ.pop(key, None)
+                os.environ.update(overrides)
+                yield
+
+        return _scoped()
+
+    def test_alias_resolution(self):
+        from skillopt_sleep.backend import CopilotCliBackend, get_backend
+        with self._copilot_env():
+            for name in ("copilot", "github_copilot", "copilot_cli", "gh_copilot"):
+                self.assertIsInstance(get_backend(name), CopilotCliBackend, name)
+
+    def test_parse_jsonl_concatenates_assistant_messages(self):
+        from skillopt_sleep.backend import CopilotCliBackend
+        raw = "\n".join([
+            '{"type":"session.info","data":{}}',
+            '{"type":"assistant.message","data":{"content":"hello"}}',
+            'not-json-noise',
+            '{"type":"user.message","data":{"content":"ignored"}}',
+            '{"type":"assistant.message","data":{"content":"world"}}',
+        ])
+        self.assertEqual(CopilotCliBackend._parse_jsonl_response(raw), "hello\nworld")
+
+    def test_parse_jsonl_ignores_non_assistant_and_blank(self):
+        from skillopt_sleep.backend import CopilotCliBackend
+        self.assertEqual(CopilotCliBackend._parse_jsonl_response(""), "")
+        self.assertEqual(
+            CopilotCliBackend._parse_jsonl_response('{"type":"result","data":{"content":"x"}}'),
+            "",
+        )
+        # assistant.message with empty/missing content contributes nothing
+        self.assertEqual(
+            CopilotCliBackend._parse_jsonl_response(
+                '{"type":"assistant.message","data":{"content":""}}\n'
+                '{"type":"assistant.message","data":{}}'
+            ),
+            "",
+        )
+
+    def test_isolated_home_by_default(self):
+        from skillopt_sleep.backend import CopilotCliBackend
+        with self._copilot_env():
+            be = CopilotCliBackend()
+        self.assertFalse(be.full_env)
+        self.assertTrue(be.copilot_home)  # an isolated COPILOT_HOME is set
+
+    def test_full_env_opt_out(self):
+        from skillopt_sleep.backend import CopilotCliBackend
+        with self._copilot_env(SKILLOPT_SLEEP_COPILOT_FULL_ENV="1"):
+            be = CopilotCliBackend()
+        self.assertTrue(be.full_env)
+        self.assertEqual(be.copilot_home, "")  # real user environment used
+
+    def test_home_override_env(self):
+        from skillopt_sleep.backend import CopilotCliBackend
+        with tempfile.TemporaryDirectory() as d:
+            target = os.path.join(d, "myhome")
+            with self._copilot_env(SKILLOPT_SLEEP_COPILOT_HOME=target):
+                be = CopilotCliBackend()
+            self.assertEqual(be.copilot_home, target)
+            self.assertTrue(os.path.isdir(target))  # created on init
+
+    def test_attempt_with_tools_honest_detection(self):
+        # End-to-end (no real CLI): a tiny per-OS stub stands in for `copilot`.
+        # It runs the local `search` shim the backend writes into its work dir
+        # (so the calllog is written — honest detection) then prints one JSONL
+        # assistant.message. Proves both the JSONL parse and that the tool call
+        # is detected from the shim's log, not from a self-reported marker.
+        import shutil
+        import stat
+
+        from skillopt_sleep.backend import CopilotCliBackend
+
+        stub_dir = tempfile.mkdtemp(prefix="skillopt_sleep_stub_")
+        try:
+            if os.name == "nt":
+                stub = os.path.join(stub_dir, "copilot.cmd")
+                with open(stub, "w") as f:
+                    # The backend writes `search.cmd`; run it (explicit `.\` so
+                    # cmd's `call` resolves it from the cwd reliably) so the
+                    # calllog is populated, then emit the JSONL line. None of
+                    # `{ } " :` need escaping in batch echo (no > < | & ^ %).
+                    f.write(
+                        "@echo off\n"
+                        'call .\\search.cmd "q" >nul 2>&1\n'
+                        'echo {"type":"assistant.message","data":{"content":"Paris"}}\n'
+                    )
+            else:
+                stub = os.path.join(stub_dir, "copilot")
+                with open(stub, "w") as f:
+                    f.write(
+                        "#!/usr/bin/env bash\n"
+                        './search "q" >/dev/null 2>&1\n'
+                        "echo '{\"type\":\"assistant.message\",\"data\":{\"content\":\"Paris\"}}'\n"
+                    )
+                os.chmod(
+                    stub,
+                    os.stat(stub).st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH,
+                )
+
+            with self._copilot_env():
+                be = CopilotCliBackend(copilot_path=stub, timeout=60)
+                task = TaskRecord(id="t1", project="p", intent="What is the capital of France?")
+                resp, called = be.attempt_with_tools(task, skill="", memory="", tools=["search"])
+
+            self.assertEqual(resp, "Paris")  # JSONL parsed via _parse_jsonl_response
+            self.assertEqual(called, ["search"])  # shim ran; detected from calllog
+        finally:
+            shutil.rmtree(stub_dir, ignore_errors=True)
+
+
if __name__ == "__main__":
unittest.main(verbosity=2)