diff --git a/package.json b/package.json index ef23eac908..f2c394691b 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,7 @@ "lint:docs": "node scripts/validate-package-readmes.mjs", "lint:manifests": "node scripts/validate-package-manifests.mjs", "lint:workflows": "node scripts/lint-workflow-triggers.mjs", - "test:scripts": "node --test scripts/lint-workflow-triggers.test.mjs scripts/validate-skills.test.mjs scripts/determine-version-utils.test.ts scripts/check-upgrade-coverage.test.mjs scripts/set-version-utils.test.ts scripts/check-publish-deps-pn-pins.test.mjs scripts/publish-packages-utils.test.mjs scripts/check-clean-tree.test.mjs scripts/lint-casts.test.mjs scripts/sync-agent-rules.test.mjs skills-contrib/drive-diagnose-run/test/load.test.ts skills-contrib/drive-diagnose-run/test/metrics.test.ts skills-contrib/drive-diagnose-run/test/invariants.test.ts skills-contrib/drive-diagnose-run/test/cascade-brief.test.ts skills-contrib/drive-diagnose-run/test/report.test.ts skills-contrib/drive-diagnose-run/test/posthoc.test.ts skills-contrib/drive-diagnose-run/test/scorecard.test.ts skills-contrib/drive-record-traces/test/emit.test.ts skills-contrib/drive-judge-harness/test/usage.test.ts skills-contrib/drive-judge-harness/test/manifest.test.ts skills-contrib/drive-judge-harness/test/load-brief.test.ts skills-contrib/drive-judge-harness/test/run-one-brief.test.ts skills-contrib/drive-judge-harness/test/validate-parser.test.ts skills-contrib/drive-judge-harness/test/judge-model-sdk.test.ts skills-contrib/drive-judge-harness/test/rubric-correctness.test.ts skills-contrib/drive-judge-harness/test/classify-failure.test.ts skills-contrib/drive-judge-harness/test/classify-operator.test.ts skills-contrib/drive-judge-harness/test/emit-correctness.test.ts skills-contrib/drive-judge-harness/test/calibration.test.ts skills-contrib/drive-judge-harness/test/prepare-run.test.ts skills-contrib/drive-judge-harness/test/collect-run.test.ts 
skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts skills-contrib/drive-judge-harness/test/run-arm.test.ts", + "test:scripts": "node --test scripts/lint-workflow-triggers.test.mjs scripts/validate-skills.test.mjs scripts/determine-version-utils.test.ts scripts/check-upgrade-coverage.test.mjs scripts/set-version-utils.test.ts scripts/check-publish-deps-pn-pins.test.mjs scripts/publish-packages-utils.test.mjs scripts/check-clean-tree.test.mjs scripts/lint-casts.test.mjs scripts/sync-agent-rules.test.mjs skills-contrib/drive-diagnose-run/test/load.test.ts skills-contrib/drive-diagnose-run/test/metrics.test.ts skills-contrib/drive-diagnose-run/test/invariants.test.ts skills-contrib/drive-diagnose-run/test/cascade-brief.test.ts skills-contrib/drive-diagnose-run/test/report.test.ts skills-contrib/drive-diagnose-run/test/posthoc.test.ts skills-contrib/drive-diagnose-run/test/scorecard.test.ts skills-contrib/drive-record-traces/test/emit.test.ts skills-contrib/drive-judge-harness/test/usage.test.ts skills-contrib/drive-judge-harness/test/manifest.test.ts skills-contrib/drive-judge-harness/test/load-brief.test.ts skills-contrib/drive-judge-harness/test/run-one-brief.test.ts skills-contrib/drive-judge-harness/test/sdk-events.test.ts skills-contrib/drive-judge-harness/test/claude-events.test.ts skills-contrib/drive-judge-harness/test/validate-parser.test.ts skills-contrib/drive-judge-harness/test/judge-model-sdk.test.ts skills-contrib/drive-judge-harness/test/rubric-correctness.test.ts skills-contrib/drive-judge-harness/test/classify-failure.test.ts skills-contrib/drive-judge-harness/test/classify-operator.test.ts skills-contrib/drive-judge-harness/test/emit-correctness.test.ts skills-contrib/drive-judge-harness/test/calibration.test.ts skills-contrib/drive-judge-harness/test/prepare-run.test.ts skills-contrib/drive-judge-harness/test/collect-run.test.ts skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts 
skills-contrib/drive-judge-harness/test/run-arm.test.ts", "drive:diagnose": "node skills-contrib/drive-diagnose-run/cli.ts", "drive:emit": "node skills-contrib/drive-record-traces/emit.ts", "drive:run-brief": "node skills-contrib/drive-judge-harness/run-one-brief.ts", @@ -59,6 +59,7 @@ "prepare": "husky && skills add ./skills-contrib --skill '*' --agent universal claude-code -y && node scripts/sync-agent-rules.mjs" }, "devDependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.3.158", "@biomejs/biome": "2.4.15", "@cursor/sdk": "^1.0.15", "@prisma-next/tsconfig": "workspace:0.11.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 94f23bdd90..e5541db452 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -58,6 +58,9 @@ importers: .: devDependencies: + '@anthropic-ai/claude-agent-sdk': + specifier: ^0.3.158 + version: 0.3.158(@anthropic-ai/sdk@0.100.1(zod@3.25.76))(@modelcontextprotocol/sdk@1.29.0(zod@3.25.76))(zod@3.25.76) '@biomejs/biome': specifier: 2.4.15 version: 2.4.15 @@ -4572,6 +4575,67 @@ packages: resolution: {integrity: sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==} engines: {node: '>=10'} + '@anthropic-ai/claude-agent-sdk-darwin-arm64@0.3.158': + resolution: {integrity: sha512-9mlkVHeHIiF7oJUjVHbieYgsTzGmKRcgUmp52BhUaDL40Gm5AC0Lotqn0ULniqlr6pNWcbA0+gjEwg7VI9VtSA==} + cpu: [arm64] + os: [darwin] + + '@anthropic-ai/claude-agent-sdk-darwin-x64@0.3.158': + resolution: {integrity: sha512-3S4ef/f2ksTmUSEK6Di9Vch2Fm5udmZq8kVKO8mAdLV+VuG6KW9kYBzbogDtwYIZFuFo8xs1sPGP2hsdZvghhA==} + cpu: [x64] + os: [darwin] + + '@anthropic-ai/claude-agent-sdk-linux-arm64-musl@0.3.158': + resolution: {integrity: sha512-lJ2ZKKirs/RTAU+9IYTd+3CKE4vYe694FkRZ04TBQPiq/ujRUa3vmGm6gSIdmDlrdYMX5j4rdcun+Ym6mPXTtA==} + cpu: [arm64] + os: [linux] + libc: [musl] + + '@anthropic-ai/claude-agent-sdk-linux-arm64@0.3.158': + resolution: {integrity: sha512-ut9uJclBqrH5NhAuVc0zN84eQM3MP4DTQqh12eVUx83eekHu7l1v6Bg+N5P/m4SM4tEhKl8lQjLpliPtML4lUA==} + 
cpu: [arm64] + os: [linux] + libc: [glibc] + + '@anthropic-ai/claude-agent-sdk-linux-x64-musl@0.3.158': + resolution: {integrity: sha512-cU0NOOA9B8I6E58HejqtO/vsYg3rfWgoaDmrJ1BzM5J4eNS3iSeaxDm7MzcyvEbTHPC1Qgj89XoiDHqhf/V/vg==} + cpu: [x64] + os: [linux] + libc: [musl] + + '@anthropic-ai/claude-agent-sdk-linux-x64@0.3.158': + resolution: {integrity: sha512-PqcDGFuzvFA0JPYa11Xcoga13oQbbAGibfASmZG5+dhoq8SniUCj0LkGGnVAgTqX4SQIIMYklS6l7egwkJIi3w==} + cpu: [x64] + os: [linux] + libc: [glibc] + + '@anthropic-ai/claude-agent-sdk-win32-arm64@0.3.158': + resolution: {integrity: sha512-47S9BUuNOYuUGaMe9ZUaRMfd1UVRt1iP9UwHWqCJUsrTPNnTCY/7lW7aecEr7Z/h3JctegTvx6Iy+mp697R1hQ==} + cpu: [arm64] + os: [win32] + + '@anthropic-ai/claude-agent-sdk-win32-x64@0.3.158': + resolution: {integrity: sha512-YLjoU6Y+WN2nqnafbbEoVd+1ISaz2lHpArTnE+sNO8hOokBLEwAHOWd8uRv9c9CSMCUEyMwvIQ7ANOEXG6NsdQ==} + cpu: [x64] + os: [win32] + + '@anthropic-ai/claude-agent-sdk@0.3.158': + resolution: {integrity: sha512-Rht8Ui7HBsVdBCG6SYs9b+JmJWAVoDXPD2pWNVMSFrzyAS4nizwdz3HtUnAobFumgzbT3LbpWzHdLfUDu4gM4w==} + engines: {node: '>=18.0.0'} + peerDependencies: + '@anthropic-ai/sdk': '>=0.93.0' + '@modelcontextprotocol/sdk': ^1.29.0 + zod: ^4.0.0 + + '@anthropic-ai/sdk@0.100.1': + resolution: {integrity: sha512-RANcEe7LpiLczkKGOwoXOTuFdPhuubS0i4xaAKOMpcqc55YO0mukgxppV7eygx3DXNjxWT6RYOLPyOy0aIAmwg==} + hasBin: true + peerDependencies: + zod: ^3.25.0 || ^4.0.0 + peerDependenciesMeta: + zod: + optional: true + '@ark/schema@0.56.0': resolution: {integrity: sha512-ECg3hox/6Z/nLajxXqNhgPtNdHWC9zNsDyskwO28WinoFEnWow4IsERNz9AnXRhTZJnYIlAJ4uGn3nlLk65vZA==} @@ -5711,6 +5775,16 @@ packages: '@mjackson/node-fetch-server@0.2.0': resolution: {integrity: sha512-EMlH1e30yzmTpGLQjlFmaDAjyOeZhng1/XCd7DExR8PNAnG/G1tyruZxEoUe11ClnwGhGrtsdnyyUx1frSzjng==} + '@modelcontextprotocol/sdk@1.29.0': + resolution: {integrity: sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ==} + engines: {node: '>=18'} 
+ peerDependencies: + '@cfworker/json-schema': ^4.1.1 + zod: ^3.25 || ^4.0 + peerDependenciesMeta: + '@cfworker/json-schema': + optional: true + '@mongodb-js/saslprep@1.4.11': resolution: {integrity: sha512-o9rAHc0IpIjuPSxRutWpE1F62x7n+4mVS4rCNHkzhIUMQcc18bb6xEq5wd2NdN0WjepIyXIppRshYI2kQDOZVA==} @@ -6604,6 +6678,9 @@ packages: '@speed-highlight/core@1.2.15': resolution: {integrity: sha512-BMq1K3DsElxDWawkX6eLg9+CKJrTVGCBAWVuHXVUV2u0s2711qiChLSId6ikYPfxhdYocLNt3wWwSvDiTvFabw==} + '@stablelib/base64@1.0.1': + resolution: {integrity: sha512-1bnPQqSxSuc3Ii6MhBysoWCg58j97aUjuCSZrGSmDxNqtytIi0k8utUenAwTZN4V5mXXYGsVUI9zeBqy+jBOSQ==} + '@standard-schema/spec@1.1.0': resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} @@ -6965,6 +7042,10 @@ packages: resolution: {integrity: sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==} engines: {node: '>= 0.6'} + accepts@2.0.0: + resolution: {integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==} + engines: {node: '>= 0.6'} + acorn-jsx-walk@2.0.0: resolution: {integrity: sha512-uuo6iJj4D4ygkdzd6jPtcxs8vZgDX9YFIkqczGImoypX2fQ4dVImmu3UzA4ynixCIMTrEOWW+95M2HuBaCEOVA==} @@ -7002,6 +7083,14 @@ packages: resolution: {integrity: sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA==} engines: {node: '>=8'} + ajv-formats@3.0.1: + resolution: {integrity: sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==} + peerDependencies: + ajv: ^8.0.0 + peerDependenciesMeta: + ajv: + optional: true + ajv@8.20.0: resolution: {integrity: sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==} @@ -7170,6 +7259,10 @@ packages: resolution: {integrity: sha512-3grm+/2tUOvu2cjJkvsIxrv/wVpfXQW4PsQHYm7yk4vfpu7Ekl6nEsYBoJUL6qDwZUx8wUhQ8tR2qz+ad9c9OA==} engines: {node: '>= 0.8', 
npm: 1.2.8000 || >= 1.4.16} + body-parser@2.2.2: + resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} + engines: {node: '>=18'} + brace-expansion@1.1.15: resolution: {integrity: sha512-EwOCDEex4quD37XhqM3omwtMoJjr//isUZz1JopUNWms+4Z2ViyM/k1YIRePpoVNnQhENnxtFjLaxNHrT7xIUg==} @@ -7346,16 +7439,28 @@ packages: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} + content-disposition@1.1.0: + resolution: {integrity: sha512-5jRCH9Z/+DRP7rkvY83B+yGIGX96OYdJmzngqnw2SBSxqCFPd0w2km3s5iawpGX8krnwSGmF0FW5Nhr0Hfai3g==} + engines: {node: '>=18'} + content-type@1.0.5: resolution: {integrity: sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==} engines: {node: '>= 0.6'} + content-type@2.0.0: + resolution: {integrity: sha512-j/O/d7GcZCyNl7/hwZAb606rzqkyvaDctLmckbxLzHvFBzTJHuGEdodATcP3yIRoDrLHkIATJuvzbFlp/ki2cQ==} + engines: {node: '>=18'} + convert-source-map@2.0.0: resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==} cookie-signature@1.0.7: resolution: {integrity: sha512-NXdYc3dLr47pBkpUCHtKSwIOQXLVn8dZEuywboCOJY/osA0wFSLlSawr3KN8qXJEyX66FcONTH8EIlVuK0yyFA==} + cookie-signature@1.2.2: + resolution: {integrity: sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==} + engines: {node: '>=6.6.0'} + cookie@0.7.2: resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==} engines: {node: '>= 0.6'} @@ -7364,6 +7469,10 @@ packages: resolution: {integrity: sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==} engines: {node: '>=18'} + cors@2.8.6: + resolution: {integrity: sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==} + 
engines: {node: '>= 0.10'} + cross-spawn@7.0.6: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} @@ -7595,6 +7704,14 @@ packages: events-universal@1.0.1: resolution: {integrity: sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==} + eventsource-parser@3.1.0: + resolution: {integrity: sha512-kJezFj9YFAMLeORyi7aCLxLbD5/qWMQnoMVlVPyHIll7lgRJCc3JVln9Vgl9nwQi0YkMnhdGTMNn7CkRRAptMg==} + engines: {node: '>=18.0.0'} + + eventsource@3.0.7: + resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==} + engines: {node: '>=18.0.0'} + evlog@1.9.0: resolution: {integrity: sha512-Dzv4drz+MydyZlLok2ATc1O4WBBDEh0+mNl2Tk3NePdaHWgmvCYYovOQgXycxn7NOSv2acRqXHfUlbP6A3rdGQ==} peerDependencies: @@ -7627,10 +7744,20 @@ packages: resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==} engines: {node: '>=12.0.0'} + express-rate-limit@8.5.2: + resolution: {integrity: sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==} + engines: {node: '>= 16'} + peerDependencies: + express: '>= 4.11' + express@4.22.1: resolution: {integrity: sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==} engines: {node: '>= 0.10.0'} + express@5.2.1: + resolution: {integrity: sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==} + engines: {node: '>= 18'} + exsolve@1.0.8: resolution: {integrity: sha512-LmDxfWXwcTArk8fUEnOfSZpHOJ6zOMUJKOtFLFqJLoKJetuQG874Uc7/Kki7zFLzYybmZhp1M7+98pfMqeX8yA==} @@ -7644,6 +7771,9 @@ packages: fast-fifo@1.3.2: resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} + fast-sha256@1.3.0: + resolution: {integrity: 
sha512-n11RGP/lrWEFI/bWdygLxhI+pVeo1ZYIVwvvPkW7azl/rOy+F3HYRZ2K5zeE9mmkhQppyv9sQFx0JM9UabnpPQ==} + fast-string-truncated-width@3.0.3: resolution: {integrity: sha512-0jjjIEL6+0jag3l2XWWizO64/aZVtpiGE3t0Zgqxv0DPuxiMjvB3M24fCyhZUO4KomJQPj3LTSUnDP3GpdwC0g==} @@ -7672,6 +7802,10 @@ packages: resolution: {integrity: sha512-aA4RyPcd3badbdABGDuTXCMTtOneUCAYH/gxoYRTZlIJdF0YPWuGqiAsIrhNnnqdXGswYk6dGujem4w80UJFhg==} engines: {node: '>= 0.8'} + finalhandler@2.1.1: + resolution: {integrity: sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==} + engines: {node: '>= 18.0.0'} + find-cache-dir@3.3.2: resolution: {integrity: sha512-wXZV5emFEjrridIgED11OoUKLxiYjAcqot/NJdAkOhlJ+vGzwhOAfcG5OX1jP+S0PcjEn8bdMJv+g2jwQ3Onig==} engines: {node: '>=8'} @@ -7701,6 +7835,10 @@ packages: resolution: {integrity: sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==} engines: {node: '>= 0.6'} + fresh@2.0.0: + resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==} + engines: {node: '>= 0.8'} + fs-constants@1.0.0: resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} @@ -7863,6 +8001,10 @@ packages: resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==} engines: {node: '>=0.10.0'} + iconv-lite@0.7.2: + resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==} + engines: {node: '>=0.10.0'} + ieee754@1.2.1: resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} @@ -7944,6 +8086,9 @@ packages: is-potential-custom-element-name@1.0.1: resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==} + is-promise@4.0.0: + resolution: {integrity: 
sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==} + isarray@2.0.5: resolution: {integrity: sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==} @@ -7970,6 +8115,9 @@ packages: resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==} hasBin: true + jose@6.2.3: + resolution: {integrity: sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==} + js-tokens@10.0.0: resolution: {integrity: sha512-lM/UBzQmfJRo9ABXbPWemivdCW8V2G8FHaHdypQaIy523snUjog0W71ayWXTjiR+ixeMyVHN2XcpnTd/liPg/Q==} @@ -7999,9 +8147,16 @@ packages: engines: {node: '>=6'} hasBin: true + json-schema-to-ts@3.1.1: + resolution: {integrity: sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==} + engines: {node: '>=16'} + json-schema-traverse@1.0.0: resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + json-schema-typed@8.0.2: + resolution: {integrity: sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==} + json-stable-stringify@1.3.0: resolution: {integrity: sha512-qtYiSSFlwot9XHtF9bD9c7rwKjr+RecWT//ZnPvSmEjpV5mmPOCN4j8UjY5hbjNkOwZ/jQv3J6R1/pL7RwgMsg==} engines: {node: '>= 0.4'} @@ -8179,12 +8334,20 @@ packages: resolution: {integrity: sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==} engines: {node: '>= 0.6'} + media-typer@1.1.0: + resolution: {integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==} + engines: {node: '>= 0.8'} + memory-pager@1.5.0: resolution: {integrity: sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==} merge-descriptors@1.0.3: resolution: {integrity: 
sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==} + merge-descriptors@2.0.0: + resolution: {integrity: sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==} + engines: {node: '>=18'} + methods@1.1.2: resolution: {integrity: sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==} engines: {node: '>= 0.6'} @@ -8201,6 +8364,10 @@ packages: resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==} engines: {node: '>= 0.6'} + mime-types@3.0.2: + resolution: {integrity: sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==} + engines: {node: '>=18'} + mime@1.6.0: resolution: {integrity: sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==} engines: {node: '>=4'} @@ -8352,6 +8519,10 @@ packages: resolution: {integrity: sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==} engines: {node: '>= 0.6'} + negotiator@1.0.0: + resolution: {integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==} + engines: {node: '>= 0.6'} + new-find-package-json@2.0.0: resolution: {integrity: sha512-lDcBsjBSMlj3LXH2v/FW3txlh2pYTjmbOXPYJD93HI5EwuLzI11tdHSIpUMmfq/IOsldj4Ps8M8flhm+pCK4Ew==} engines: {node: '>=12.22.0'} @@ -8402,6 +8573,10 @@ packages: engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0} deprecated: This package is no longer supported. 
+ object-assign@4.1.1: + resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} + engines: {node: '>=0.10.0'} + object-hash@2.2.0: resolution: {integrity: sha512-gScRMn0bS5fH+IuwyIFgnh9zBdo4DV+6GhygmWM9HyNJSgS0hScp1f5vjtm7oIIOiT9trXrShAkLFSc2IqKNgw==} engines: {node: '>= 6'} @@ -8496,6 +8671,9 @@ packages: path-to-regexp@6.3.0: resolution: {integrity: sha512-Yhpw4T9C6hPpgPeA28us07OJeqZ5EzQTkbfwuhsUg0c237RomFoETJgmp2sa3F/41gfLE6G5cqcYwznmeEeOlQ==} + path-to-regexp@8.4.2: + resolution: {integrity: sha512-qRcuIdP69NPm4qbACK+aDogI5CBDMi1jKe0ry5rSQJz8JVLsC7jV8XpiJjGRLLol3N+R5ihGYcrPLTno6pAdBA==} + pathe@1.1.2: resolution: {integrity: sha512-whLdWMYL2TwI08hn8/ZqAbrVemu0LNaNNJZX73O6qaIdCTfXutsLhMkjdENX0qhsQ9uIimo4/aQOmXkoon2nDQ==} @@ -8592,6 +8770,10 @@ packages: resolution: {integrity: sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==} engines: {node: '>=12'} + pkce-challenge@5.0.1: + resolution: {integrity: sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==} + engines: {node: '>=16.20.0'} + pkg-dir@4.2.0: resolution: {integrity: sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==} engines: {node: '>=8'} @@ -8702,6 +8884,10 @@ packages: resolution: {integrity: sha512-s4VSOf6yN0rvbRZGxs8Om5CWj6seneMwK3oDb4lWDH0UPhWcxwOWw5+qk24bxq87szX1ydrwylIOp2uG1ojUpA==} engines: {node: '>= 0.8'} + raw-body@3.0.2: + resolution: {integrity: sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==} + engines: {node: '>= 0.10'} + rc9@3.0.1: resolution: {integrity: sha512-gMDyleLWVE+i6Sgtc0QbbY6pEKqYs97NGi6isHQPqYlLemPoO8dxQ3uGi0f4NiP98c+jMW6cG1Kx9dDwfvqARQ==} @@ -8865,6 +9051,10 @@ packages: engines: {node: '>=18.0.0', npm: '>=8.0.0'} hasBin: true + router@2.2.0: + resolution: {integrity: 
sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==} + engines: {node: '>= 18'} + safe-buffer@5.1.2: resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} @@ -8911,10 +9101,18 @@ packages: resolution: {integrity: sha512-VMbMxbDeehAxpOtWJXlcUS5E8iXh6QmN+BkRX1GARS3wRaXEEgzCcB10gTQazO42tpNIya8xIyNx8fll1OFPrg==} engines: {node: '>= 0.8.0'} + send@1.2.1: + resolution: {integrity: sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==} + engines: {node: '>= 18'} + serve-static@1.16.3: resolution: {integrity: sha512-x0RTqQel6g5SY7Lg6ZreMmsOzncHFU7nhnRWkKgWuMTu5NN0DR5oruckMqRvacAN9d5w6ARnRBXl9xhDCgfMeA==} engines: {node: '>= 0.8.0'} + serve-static@2.2.1: + resolution: {integrity: sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==} + engines: {node: '>= 18'} + set-blocking@2.0.0: resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} @@ -9041,6 +9239,9 @@ packages: stackback@0.0.2: resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==} + standardwebhooks@1.0.0: + resolution: {integrity: sha512-BbHGOQK9olHPMvQNHWul6MYlrRTAOKn03rOe4A8O3CLWhNf4YHBqq2HJKKC+sfqpxiBY52pNeesD6jIiLDz8jg==} + statuses@2.0.2: resolution: {integrity: sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==} engines: {node: '>= 0.8'} @@ -9201,6 +9402,9 @@ packages: resolution: {integrity: sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==} hasBin: true + ts-algebra@2.0.0: + resolution: {integrity: sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==} + ts-toolbelt@9.6.0: resolution: {integrity: 
sha512-nsZd8ZeNUzukXPlJmTBwUAuABDe/9qtVDelJeT/qW0ow3ZS3BsQJtNkan1802aM9Uf68/Y8ljw86Hu0h5IUW3w==} @@ -9278,6 +9482,10 @@ packages: resolution: {integrity: sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==} engines: {node: '>= 0.6'} + type-is@2.1.0: + resolution: {integrity: sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==} + engines: {node: '>= 18'} + typescript@5.9.3: resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} engines: {node: '>=14.17'} @@ -9730,6 +9938,11 @@ packages: zeptomatch@2.1.0: resolution: {integrity: sha512-KiGErG2J0G82LSpniV0CtIzjlJ10E04j02VOudJsPyPwNZgGnRKQy7I1R7GMyg/QswnE4l7ohSGrQbQbjXPPDA==} + zod-to-json-schema@3.25.2: + resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==} + peerDependencies: + zod: ^3.25.28 || ^4 + zod@3.25.76: resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==} @@ -9739,6 +9952,52 @@ snapshots: '@alloc/quick-lru@5.2.0': {} + '@anthropic-ai/claude-agent-sdk-darwin-arm64@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-darwin-x64@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-arm64-musl@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-arm64@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-x64-musl@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-x64@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-win32-arm64@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk-win32-x64@0.3.158': + optional: true + + '@anthropic-ai/claude-agent-sdk@0.3.158(@anthropic-ai/sdk@0.100.1(zod@3.25.76))(@modelcontextprotocol/sdk@1.29.0(zod@3.25.76))(zod@3.25.76)': + dependencies: + '@anthropic-ai/sdk': 0.100.1(zod@3.25.76) + 
'@modelcontextprotocol/sdk': 1.29.0(zod@3.25.76) + zod: 3.25.76 + optionalDependencies: + '@anthropic-ai/claude-agent-sdk-darwin-arm64': 0.3.158 + '@anthropic-ai/claude-agent-sdk-darwin-x64': 0.3.158 + '@anthropic-ai/claude-agent-sdk-linux-arm64': 0.3.158 + '@anthropic-ai/claude-agent-sdk-linux-arm64-musl': 0.3.158 + '@anthropic-ai/claude-agent-sdk-linux-x64': 0.3.158 + '@anthropic-ai/claude-agent-sdk-linux-x64-musl': 0.3.158 + '@anthropic-ai/claude-agent-sdk-win32-arm64': 0.3.158 + '@anthropic-ai/claude-agent-sdk-win32-x64': 0.3.158 + + '@anthropic-ai/sdk@0.100.1(zod@3.25.76)': + dependencies: + json-schema-to-ts: 3.1.1 + standardwebhooks: 1.0.0 + optionalDependencies: + zod: 3.25.76 + '@ark/schema@0.56.0': dependencies: '@ark/util': 0.56.0 @@ -10593,6 +10852,28 @@ snapshots: '@mjackson/node-fetch-server@0.2.0': {} + '@modelcontextprotocol/sdk@1.29.0(zod@3.25.76)': + dependencies: + '@hono/node-server': 1.19.11(hono@4.11.4) + ajv: 8.20.0 + ajv-formats: 3.0.1(ajv@8.20.0) + content-type: 1.0.5 + cors: 2.8.6 + cross-spawn: 7.0.6 + eventsource: 3.0.7 + eventsource-parser: 3.1.0 + express: 5.2.1 + express-rate-limit: 8.5.2(express@5.2.1) + hono: 4.11.4 + jose: 6.2.3 + json-schema-typed: 8.0.2 + pkce-challenge: 5.0.1 + raw-body: 3.0.2 + zod: 3.25.76 + zod-to-json-schema: 3.25.2(zod@3.25.76) + transitivePeerDependencies: + - supports-color + '@mongodb-js/saslprep@1.4.11': dependencies: sparse-bitfield: 3.0.3 @@ -11338,6 +11619,8 @@ snapshots: '@speed-highlight/core@1.2.15': {} + '@stablelib/base64@1.0.1': {} + '@standard-schema/spec@1.1.0': {} '@statsig/client-core@3.31.0': {} @@ -11672,6 +11955,11 @@ snapshots: mime-types: 2.1.35 negotiator: 0.6.3 + accepts@2.0.0: + dependencies: + mime-types: 3.0.2 + negotiator: 1.0.0 + acorn-jsx-walk@2.0.0: {} acorn-jsx@5.3.2(acorn@8.16.0): @@ -11708,6 +11996,10 @@ snapshots: indent-string: 4.0.0 optional: true + ajv-formats@3.0.1(ajv@8.20.0): + optionalDependencies: + ajv: 8.20.0 + ajv@8.20.0: dependencies: fast-deep-equal: 3.1.3 @@ 
-11879,6 +12171,20 @@ snapshots: transitivePeerDependencies: - supports-color + body-parser@2.2.2: + dependencies: + bytes: 3.1.2 + content-type: 1.0.5 + debug: 4.4.3 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + on-finished: 2.4.1 + qs: 6.15.1 + raw-body: 3.0.2 + type-is: 2.1.0 + transitivePeerDependencies: + - supports-color + brace-expansion@1.1.15: dependencies: balanced-match: 1.0.2 @@ -12072,16 +12378,27 @@ snapshots: dependencies: safe-buffer: 5.2.1 + content-disposition@1.1.0: {} + content-type@1.0.5: {} + content-type@2.0.0: {} + convert-source-map@2.0.0: {} cookie-signature@1.0.7: {} + cookie-signature@1.2.2: {} + cookie@0.7.2: {} cookie@1.1.1: {} + cors@2.8.6: + dependencies: + object-assign: 4.1.1 + vary: 1.1.2 + cross-spawn@7.0.6: dependencies: path-key: 3.1.1 @@ -12347,6 +12664,12 @@ snapshots: transitivePeerDependencies: - bare-abort-controller + eventsource-parser@3.1.0: {} + + eventsource@3.0.7: + dependencies: + eventsource-parser: 3.1.0 + evlog@1.9.0: {} exit-hook@2.2.1: {} @@ -12355,6 +12678,11 @@ snapshots: expect-type@1.3.0: {} + express-rate-limit@8.5.2(express@5.2.1): + dependencies: + express: 5.2.1 + ip-address: 10.2.0 + express@4.22.1: dependencies: accepts: 1.3.8 @@ -12391,6 +12719,39 @@ snapshots: transitivePeerDependencies: - supports-color + express@5.2.1: + dependencies: + accepts: 2.0.0 + body-parser: 2.2.2 + content-disposition: 1.1.0 + content-type: 1.0.5 + cookie: 0.7.2 + cookie-signature: 1.2.2 + debug: 4.4.3 + depd: 2.0.0 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + finalhandler: 2.1.1 + fresh: 2.0.0 + http-errors: 2.0.1 + merge-descriptors: 2.0.0 + mime-types: 3.0.2 + on-finished: 2.4.1 + once: 1.4.0 + parseurl: 1.3.3 + proxy-addr: 2.0.7 + qs: 6.15.1 + range-parser: 1.2.1 + router: 2.2.0 + send: 1.2.1 + serve-static: 2.2.1 + statuses: 2.0.2 + type-is: 2.1.0 + vary: 1.1.2 + transitivePeerDependencies: + - supports-color + exsolve@1.0.8: {} extend-shallow@2.0.1: @@ -12401,6 +12762,8 @@ snapshots: fast-fifo@1.3.2: {} + 
fast-sha256@1.3.0: {} + fast-string-truncated-width@3.0.3: {} fast-string-width@3.0.2: @@ -12431,6 +12794,17 @@ snapshots: transitivePeerDependencies: - supports-color + finalhandler@2.1.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + on-finished: 2.4.1 + parseurl: 1.3.3 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + find-cache-dir@3.3.2: dependencies: commondir: 1.0.1 @@ -12455,6 +12829,8 @@ snapshots: fresh@0.5.2: {} + fresh@2.0.0: {} + fs-constants@1.0.0: {} fs-minipass@2.1.0: @@ -12630,6 +13006,10 @@ snapshots: safer-buffer: 2.1.2 optional: true + iconv-lite@0.7.2: + dependencies: + safer-buffer: 2.1.2 + ieee754@1.2.1: {} ignore@7.0.5: {} @@ -12660,8 +13040,7 @@ snapshots: interpret@3.1.1: {} - ip-address@10.2.0: - optional: true + ip-address@10.2.0: {} ipaddr.js@1.9.1: {} @@ -12690,6 +13069,8 @@ snapshots: is-potential-custom-element-name@1.0.1: {} + is-promise@4.0.0: {} + isarray@2.0.5: {} isbot@5.1.40: {} @@ -12711,6 +13092,8 @@ snapshots: jiti@2.7.0: {} + jose@6.2.3: {} + js-tokens@10.0.0: {} js-tokens@4.0.0: {} @@ -12750,8 +13133,15 @@ snapshots: jsesc@3.1.0: {} + json-schema-to-ts@3.1.1: + dependencies: + '@babel/runtime': 7.29.2 + ts-algebra: 2.0.0 + json-schema-traverse@1.0.0: {} + json-schema-typed@8.0.2: {} + json-stable-stringify@1.3.0: dependencies: call-bind: 1.0.9 @@ -12920,10 +13310,14 @@ snapshots: media-typer@0.3.0: {} + media-typer@1.1.0: {} + memory-pager@1.5.0: {} merge-descriptors@1.0.3: {} + merge-descriptors@2.0.0: {} + methods@1.1.2: {} mime-db@1.52.0: {} @@ -12934,6 +13328,10 @@ snapshots: dependencies: mime-db: 1.52.0 + mime-types@3.0.2: + dependencies: + mime-db: 1.54.0 + mime@1.6.0: {} mimic-function@5.0.1: {} @@ -13111,6 +13509,8 @@ snapshots: negotiator@0.6.4: {} + negotiator@1.0.0: {} + new-find-package-json@2.0.0: dependencies: debug: 4.4.3 @@ -13179,6 +13579,8 @@ snapshots: set-blocking: 2.0.0 optional: true + object-assign@4.1.1: {} + object-hash@2.2.0: {} object-inspect@1.13.4: 
{} @@ -13251,6 +13653,8 @@ snapshots: path-to-regexp@6.3.0: {} + path-to-regexp@8.4.2: {} + pathe@1.1.2: {} pathe@2.0.3: {} @@ -13317,6 +13721,8 @@ snapshots: picomatch@4.0.4: {} + pkce-challenge@5.0.1: {} + pkg-dir@4.2.0: dependencies: find-up: 4.1.0 @@ -13438,6 +13844,13 @@ snapshots: iconv-lite: 0.4.24 unpipe: 1.0.0 + raw-body@3.0.2: + dependencies: + bytes: 3.1.2 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + unpipe: 1.0.0 + rc9@3.0.1: dependencies: defu: 6.1.7 @@ -13642,6 +14055,16 @@ snapshots: '@rollup/rollup-win32-x64-msvc': 4.59.0 fsevents: 2.3.3 + router@2.2.0: + dependencies: + debug: 4.4.3 + depd: 2.0.0 + is-promise: 4.0.0 + parseurl: 1.3.3 + path-to-regexp: 8.4.2 + transitivePeerDependencies: + - supports-color + safe-buffer@5.1.2: {} safe-buffer@5.2.1: {} @@ -13689,6 +14112,22 @@ snapshots: transitivePeerDependencies: - supports-color + send@1.2.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + fresh: 2.0.0 + http-errors: 2.0.1 + mime-types: 3.0.2 + ms: 2.1.3 + on-finished: 2.4.1 + range-parser: 1.2.1 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + serve-static@1.16.3: dependencies: encodeurl: 2.0.0 @@ -13698,6 +14137,15 @@ snapshots: transitivePeerDependencies: - supports-color + serve-static@2.2.1: + dependencies: + encodeurl: 2.0.0 + escape-html: 1.0.3 + parseurl: 1.3.3 + send: 1.2.1 + transitivePeerDependencies: + - supports-color + set-blocking@2.0.0: optional: true @@ -13873,6 +14321,11 @@ snapshots: stackback@0.0.2: {} + standardwebhooks@1.0.0: + dependencies: + '@stablelib/base64': 1.0.1 + fast-sha256: 1.3.0 + statuses@2.0.2: {} std-env@3.10.0: {} @@ -14035,6 +14488,8 @@ snapshots: tree-kill@1.2.2: {} + ts-algebra@2.0.0: {} + ts-toolbelt@9.6.0: {} tsconfck@3.1.6(typescript@5.9.3): @@ -14108,6 +14563,12 @@ snapshots: media-typer: 0.3.0 mime-types: 2.1.35 + type-is@2.1.0: + dependencies: + content-type: 2.0.0 + media-typer: 1.1.0 + mime-types: 3.0.2 + typescript@5.9.3: {} 
unconfig-core@7.5.0: @@ -14488,4 +14949,8 @@ snapshots: grammex: 3.1.12 graphmatch: 1.1.1 + zod-to-json-schema@3.25.2(zod@3.25.76): + dependencies: + zod: 3.25.76 + zod@3.25.76: {} diff --git a/projects/drive-judge-harness/slices/claude-runtime/plan.md b/projects/drive-judge-harness/slices/claude-runtime/plan.md new file mode 100644 index 0000000000..c3ee0bf977 --- /dev/null +++ b/projects/drive-judge-harness/slices/claude-runtime/plan.md @@ -0,0 +1,29 @@ +# Plan: claude-runtime (TML-2759) + +Test-first. The Claude SDK is reached only via `claude-adapter.ts`'s lazy import (mirroring `sdk-adapter.ts`); all mapping logic lives in the no-SDK `claude-events.ts` so it's unit-testable with the SDK absent. Built on branch `tml-2757-run-fidelity` (PR #657), on top of the run-fidelity commits. + +## Dispatches + +### D1 — `claude-events.ts`: pure mappers + extraction (test-first) +- **Outcome:** Claude message/result shapes map to the harness's `RunStreamEvent` + a rich outcome, with no SDK import. +- Implement `usageFromAssistant`, `streamEventFromMessage`, `outcomeFromResult` (→ `{status,runId,tokens,durationMs,costUsd,numTurns}`) over `unknown`. Map `cache_creation_input_tokens`→`cacheWriteTokens`, `cache_read_input_tokens`→`cacheReadTokens`; `session_id`→`runId`; `subtype==='success'`→`finished`. +- Tests (`test/claude-events.test.ts`): real `SDKResultMessage` (success + an `error_*` subtype) + a real `assistant` message; assert token totals, `cost_usd`, `wall_clock_ms` (`duration_ms`), `num_turns`, `run_id`; degrade on non-records. SDK not installed. +- **Builds on:** run-fidelity (`usage.ts`, the seam). **Hands to:** D2. + +### D2 — `claude-adapter.ts` + seam/manifest + runtime selection (test-first) +- **Outcome:** the harness runs on Claude by default and records tokens/cost/turns; `--runtime cursor` still works. +- `RunOutcome` gains `tokens`/`costUsd`/`numTurns` (Cursor adapter sets null). 
`run-one-brief.ts`: prefer `outcome.tokens` else `accumulateUsage`; populate `cost_usd`/`num_turns`/`wall_clock_ms`; runtime selection + per-runtime key gating; `defaultCreateAgent(runtime)`. `manifest.ts`: add `runtime`/`cost_usd`/`num_turns`. `run-arm.ts` + `run-one-brief.ts` CLIs: `--runtime` (default claude), `--max-budget-usd`. +- `claude-adapter.ts`: `query()` with `cwd`/`settingSources:['project']`/`skills:'all'`/`permissionMode:'bypassPermissions'`/`allowDangerouslySkipPermissions:true`/`model`/`maxBudgetUsd`; buffer the result for `wait()`. +- Tests: injected `createAgent` returning a Claude-shaped outcome → manifest has `runtime:'claude'`, non-null `tokens`/`cost_usd`/`num_turns`; a `--runtime cursor` selection test; key-gating per runtime. +- **Builds on:** D1. **Hands to:** D3 (orchestrator). + +### D3 — install + docs + live smoke + gates + PR (orchestrator) +- Install `@anthropic-ai/claude-agent-sdk` (`pnpm add -w -D`); handle any build-script/native hiccups as with `@cursor/sdk`. +- Wire `test/claude-events.test.ts` into `test:scripts`. +- Docs: SKILL.md "Runtimes" section (claude default / cursor secondary, selection, `maxBudgetUsd`); scope the token-gap note to the Cursor adapter in SKILL.md + KNOWN-ISSUES. +- Live smoke on `claude-haiku-4-5` iff `ANTHROPIC_API_KEY` present (else gated follow-up note). +- Gates: `pnpm test:scripts`, biome, transient-id scan. Update PR #657 title/body to "faithful + decoupled runs" (refs TML-2757 + TML-2759). Commit signed-off, push. +- **Builds on:** D2. + +## Sequencing +Serial: D1 (mappers) → D2 (adapter + wiring consume them) → D3 (install/docs/gates). Target 3 dispatches; D1+D2 delegated to one implementer, D3 by the orchestrator. 
diff --git a/projects/drive-judge-harness/slices/claude-runtime/spec.md b/projects/drive-judge-harness/slices/claude-runtime/spec.md new file mode 100644 index 0000000000..4d828c3b7e --- /dev/null +++ b/projects/drive-judge-harness/slices/claude-runtime/spec.md @@ -0,0 +1,88 @@ +# Slice: claude-runtime + +_Parent project `projects/drive-judge-harness/`. Outcome this slice contributes: the harness is **decoupled from Cursor** — it runs the Drive orchestrator on Anthropic's Claude Agent SDK by default, which reports real token usage, USD cost, and wall-clock natively (the signal `@cursor/sdk`'s local runtime never gave us). The Cursor adapter stays as a runtime-selectable secondary. Delivered alongside the run-fidelity fixes on the same branch/PR (#657)._ + +## At a glance + +A live run now records tokens + dollars + wall-clock, because the runtime reports them: + +```jsonc +{ "runtime": "claude", "model": "claude-haiku-4-5", "status": "finished", + "run_id": "", "agent_id": null, + "tokens": { "inputTokens": 33, "outputTokens": 904, "cacheReadTokens": 230827, "cacheWriteTokens": 53995, "totalTokens": 285759 }, + "cost_usd": 0.1839242, "num_turns": 9, "wall_clock_ms": 16025, "notes": [] } +``` + +The Cursor runtime stays available via `--runtime cursor`; its token gap (documented in the run-fidelity work) is now scoped to that adapter. + +## Chosen design + +The Cursor coupling lives in exactly one module behind a seam that already exists: `run-one-brief.ts` defines `CreateAgent` / `OrchestratorRun` / `RunOutcome`; `sdk-adapter.ts` is the only `@cursor/sdk` importer. This slice adds a **second adapter** over the same seam. + +Ground-truth Claude Agent SDK shapes (`@anthropic-ai/claude-agent-sdk`, confirmed from the cost-tracking + TS-reference docs): +- `query({ prompt, options })` returns an async iterable of messages. 
+- Per-`assistant` message: nested `message.usage` (`input_tokens`, `output_tokens`, `cache_creation_input_tokens`, `cache_read_input_tokens`) + `message.id`. +- Terminal `result` message (`SDKResultMessage`): `subtype` (`success` | `error_*`), cumulative `usage` (same fields), `total_cost_usd`, `duration_ms`, `num_turns`, `session_id`, `result`. + +### 1. `claude-events.ts` — pure mappers (no SDK import) + +Mirror of `sdk-events.ts`, for the Claude shapes. Operates over `unknown`; imports nothing from the SDK so it's unit-testable with the SDK absent. Exports: +- `usageFromAssistant(msg) -> TurnUsage | null` — maps `message.usage` (`cache_creation_input_tokens`→`cacheWriteTokens`, `cache_read_input_tokens`→`cacheReadTokens`). +- `streamEventFromMessage(msg) -> RunStreamEvent` — `assistant` with usage → `turn-ended`; else `other`. +- `outcomeFromResult(msg) -> { status; runId; tokens; durationMs; costUsd; numTurns } | null` — only for `type: 'result'`. `subtype === 'success'` → `finished`, else `error`; `session_id` → `runId`; cumulative `usage` → `TokenTotals`; `total_cost_usd` → `costUsd`; `duration_ms` → `durationMs`; `num_turns` → `numTurns`. Degrades on non-records. + +### 2. `claude-adapter.ts` — the only Claude-SDK importer (lazy) + +Implements `CreateAgent` over `query()`. Because `query()` is one generator (not split stream/wait), the adapter iterates it inside `stream()`, yields `turn-ended` events from per-assistant usage, captures the terminal `result` message, and returns it from `wait()` (run-one-brief drains the stream before calling `wait()`, so the result is available). 
`query()` options for an **unattended, skill-aware** orchestrator run: +- `cwd: runDir` (the prepared checkout — its `.claude/skills/` are the injected bundle) +- `settingSources: ['project']` (loads `.claude/skills/`, `.claude/agents/`, `CLAUDE.md` from the checkout) +- `skills: 'all'` (auto-enables the `Skill` tool) +- `permissionMode: 'bypassPermissions'` + `allowDangerouslySkipPermissions: true` (no interactive prompts) +- `model` (the pinned model id) +- `maxBudgetUsd` when provided (hard per-run dollar cap — aborts with `error_max_budget_usd`) + +### 3. Seam + manifest extensions + +- `RunOutcome` gains `tokens: TokenTotals | null`, `costUsd: number | null`, `numTurns: number | null` (Cursor adapter sets these `null`; tokens still flow via per-turn accumulation there). +- `run-one-brief.ts`: prefer `outcome.tokens` when present, else fall back to `accumulateUsage(usageUpdates)`. Populate `cost_usd` / `num_turns` / `wall_clock_ms` from the outcome. The null-token note (from the run-fidelity work) fires only when tokens are genuinely null. +- `RunManifest` gains `runtime: 'claude' | 'cursor'`, `cost_usd: number | null`, `num_turns: number | null`. +- **Runtime selection:** `RunOneBriefConfig`/`RunArmConfig` gain `runtime: 'claude' | 'cursor'` (default `'claude'`) and optional `maxBudgetUsd`. `defaultCreateAgent(runtime)` lazily imports the matching adapter. The gate's `apiKeyPresent` is computed against the runtime's key (`ANTHROPIC_API_KEY` for claude, `CURSOR_API_KEY` for cursor). CLI gains `--runtime <claude|cursor>` (default claude) and `--max-budget-usd <n>`. + +## Coherence rationale + +One reviewer holds it in one sitting: a second adapter behind an existing seam, plus the manifest fields the new runtime can finally populate. It's entangled with the run-fidelity work on the same branch — both are "make the recorded run faithful," and this slice is what turns the token gap that work documented into a captured signal.
Rolls forward as: new pure module + new adapter + additive outcome/manifest fields + a runtime selector. No production package touched. + +## Scope + +**In:** `claude-events.ts` (+ tests with real result/assistant fixtures); `claude-adapter.ts` (lazy, sole Claude-SDK importer); `RunOutcome`/`RunManifest` additions; runtime selection + key-gating + CLI flags in `run-one-brief.ts` and `run-arm.ts`; install `@anthropic-ai/claude-agent-sdk`; SKILL.md runtimes section + scope the token-gap doc to the Cursor adapter; new test wired into `test:scripts`. Delivered on branch `tml-2757-run-fidelity` / PR #657. + +**Out:** removing the Cursor adapter (kept as secondary, operator decision). The A/B loop / aggregation / CI gate (TML-2737). Judge calibration (TML-2736) and corpus generation (real-dollar, operator-gated). + +## Pre-investigated edge cases + +| Edge case | Disposition | Notes | +|---|---|---| +| `query()` is one generator, not stream+wait | Drove the adapter shape | Iterate in `stream()`, stash the `result` message for `wait()`. | +| Claude reports cumulative usage on `result`, not just per-turn | `RunOutcome.tokens` | run-one-brief prefers outcome tokens; per-turn accumulation stays the Cursor path. | +| No `agent_id` concept in Claude SDK | `agent_id: null`, `session_id`→`run_id` | The session id is the run identifier. | +| Unattended run hitting a permission prompt | `bypassPermissions` + `allowDangerouslySkipPermissions` | Required for autonomous orchestrator runs. | +| Runaway cost during calibration | `maxBudgetUsd` cap | Aborts with `error_max_budget_usd`; recorded as an error run with usage-so-far. | +| `@anthropic-ai/claude-agent-sdk` not installed at test time | Lazy import behind the gate | `claude-events.ts` has no SDK import; tests never load the adapter. 
| + +## Slice-specific done conditions + +- [ ] A test feeds a real `SDKResultMessage` (success + an `error_*` subtype) through `claude-events.ts` and asserts `tokens`, `cost_usd`, `wall_clock_ms`, `num_turns`, `run_id` extraction — with the SDK not installed. +- [ ] `--runtime cursor` still produces a Cursor-runtime manifest (selection works both ways). +- [ ] A live smoke run on `claude-haiku-4-5` records non-null `tokens` + `cost_usd` **iff** `ANTHROPIC_API_KEY` is present; otherwise this is a gated follow-up. + +## Open Questions + +1. **Subagent token attribution.** Claude's `usage` aggregates orchestrator + subagents into one run total (per-subagent breakdown is an open SDK request). Working position: the run total is exactly what we want for the efficiency metric; per-subagent attribution is not needed for this slice. + +## References + +- Parent: `projects/drive-judge-harness/spec.md`; sibling run-fidelity slice (same branch). +- Spike: `projects/drive-judge-harness/spikes/2026-05-31-sdk-token-usage-retrieval.md`. +- Linear: [TML-2759](https://linear.app/prisma-company/issue/TML-2759) (related TML-2757, blocks TML-2737). +- SDK docs: [cost-tracking](https://code.claude.com/docs/en/agent-sdk/cost-tracking), [TS reference](https://code.claude.com/docs/en/agent-sdk/typescript), [skills](https://code.claude.com/docs/en/agent-sdk/skills). +- Seam: `skills-contrib/drive-judge-harness/{run-one-brief,sdk-adapter,sdk-events,run-arm,manifest,usage}.ts`. diff --git a/projects/drive-judge-harness/slices/run-fidelity/plan.md b/projects/drive-judge-harness/slices/run-fidelity/plan.md new file mode 100644 index 0000000000..5689ab341c --- /dev/null +++ b/projects/drive-judge-harness/slices/run-fidelity/plan.md @@ -0,0 +1,37 @@ +# Plan: run-fidelity (TML-2757) + +Test-first throughout. The live SDK is reached only via `sdk-adapter.ts`'s dynamic import; all new logic lives in no-SDK-import modules so it's unit-testable with `@cursor/sdk` absent. 
Spike `2026-05-31-sdk-token-usage-retrieval.md` is committed in dispatch 1. + +## Dispatches + +### D1 — `sdk-events.ts`: pure mappers + real-shape extraction (test-first) +- **Outcome:** message/outcome mapping lives in a no-SDK module, with `agent_id` and `durationMs` extracted from the **real captured shapes**. +- Move `extractText` / `toStreamEvent` / `adaptOutcome` (and the now-dead `extractUsage`) out of `sdk-adapter.ts` into new `sdk-events.ts` (imports nothing from the SDK; operates over `unknown`). Add `agentIdFromMessage`, `outcomeFromResult` (→ `{status,runId,durationMs}`), `streamEventFromMessage`. +- Tests (`test/sdk-events.test.ts`): feed the real `status`/`assistant`/outcome fixtures from the spike; assert `agent_id`, `durationMs`, stream mapping. Runs with the SDK uninstalled. +- `sdk-adapter.ts` imports the mappers (no behaviour change). +- Commit the spike artifact here. +- **Builds on:** merged run-setup. **Hands to:** D2. + +### D2 — capture agent_id + wall-clock end-to-end (test-first) +- **Outcome:** a finished run records the real `agent_id` and `wall_clock_ms`. +- `run-one-brief.ts`: `RunOutcome` gains `durationMs: number | null`; adapter captures `agent_id` from the first stream message carrying one and returns it from `wait()`. +- `manifest.ts`: add `wall_clock_ms`; add the token-unavailable note when `tokens` is null on a finished live run. `run-arm.ts` threads `wall_clock_ms` into the enriched manifest. +- Tests: outcome→manifest mapping populates `agent_id` + `wall_clock_ms`; null-token note present. +- **Builds on:** D1. **Hands to:** D3. + +### D3 — `collect-run` run-scoping (test-first) +- **Outcome:** `collectRun` returns only traces emitted during the run. +- `prepare-run.ts`: snapshot `*.jsonl` under `runDir` after the baseline commit → `PreparedRun.preexistingTracePaths`. +- `collect-run.ts`: exclude `preexistingTracePaths`; `agent_id` match over the remainder. 
+- Tests: baseline-committed trace + run-emitted trace → only the latter returned (cover a gitignored-path trace). +- **Builds on:** D2. **Hands to:** D4. + +### D4 — docs + gates + PR +- **Outcome:** token gap documented; suite green; PR open. +- SKILL.md / KNOWN-ISSUES: token gap (link spike) + wall-clock-as-primary note. +- Wire new tests into `test:scripts`; run `pnpm -w typecheck`, `pnpm -w lint`, `pnpm -w test:scripts`; fix fallout. +- Stage explicitly, sign off, push to `tml-2757-run-fidelity`, open PR (create-pr skill). +- **Builds on:** D3. + +## Sequencing +Serial: D1 unlocks testability, D2 consumes the extractors, D3 is independent of D2 but shares the manifest touch (sequence after to avoid conflict), D4 closes. Target 4 dispatches. diff --git a/projects/drive-judge-harness/slices/run-fidelity/spec.md b/projects/drive-judge-harness/slices/run-fidelity/spec.md new file mode 100644 index 0000000000..d740089f02 --- /dev/null +++ b/projects/drive-judge-harness/slices/run-fidelity/spec.md @@ -0,0 +1,76 @@ +# Slice: run-fidelity + +_Parent project `projects/drive-judge-harness/`. Outcome this slice contributes: the harness records a **faithful** run — correct `agent_id`, a real wall-clock signal, and a trace set scoped to what the run actually emitted — so the corpus the judge calibrates against and the A/B engine ranks on isn't polluted or blank. Fixes the three fidelity defects the first live `run-arm` exposed._ + +## At a glance + +The first live run (composer-2.5, i12-halt) proved the pipeline but mis-recorded the run: `agent_id: null`, `tokens` all-zero, and `collected_trace_paths` containing 5 pre-existing committed traces from the base checkout plus 1 real one. This slice fixes the recordable defects and honestly documents the one that isn't recordable: + +- **`agent_id`** is read from the stream `status` message (where the local runtime actually puts it), not the `wait()` outcome. 
+- **Wall-clock** (`durationMs` from the outcome) is captured as `wall_clock_ms` — the primary Tier-2 efficiency metric, since tokens are unavailable. +- **`collect-run`** returns only traces *emitted during the run*, not every schema-valid `.jsonl` in the checkout. +- **Tokens** stay `null` for local runs with an explicit note + documented SDK limitation (spike `2026-05-31-sdk-token-usage-retrieval.md`). + +## Chosen design + +Ground-truth shapes from the spike probe (`@cursor/sdk@1.0.15`, local runtime): +- stream `status` → `{ type:"status", agent_id, run_id, status }` +- stream `assistant` → `{ type:"assistant", agent_id, run_id, message }` +- outcome (`wait()`) → `{ id, status, result, model, durationMs }` (no `agent_id`, no tokens) + +### 1. `sdk-events.ts` — extract the pure mappers (no SDK import) + +Today the message/outcome mappers (`extractUsage`, `extractText`, `adaptOutcome`, `toStreamEvent`) live inside `sdk-adapter.ts`, which `import`s `@cursor/sdk` at module top — so they can't be unit-tested without the SDK installed. Move them into a new **`sdk-events.ts`** that imports nothing from the SDK and operates over `unknown`. `sdk-adapter.ts` imports them. This is what lets the fixes be test-first while preserving the live-execution gate (SDK reached only via `sdk-adapter.ts`'s dynamic import). + +`sdk-events.ts` exports pure functions, unit-tested against the **real captured shapes**: +- `streamEventFromMessage(msg) -> RunStreamEvent` — maps `status`/`assistant` (real shapes) and keeps the `turn-ended` branch for the cloud runtime (still valid if ever used). +- `agentIdFromMessage(msg) -> string | null` — reads snake_case `agent_id`. +- `outcomeFromResult(raw) -> { status, runId, durationMs }` — reads `id`→runId, `status`, `durationMs` (number|null). + +### 2. `run-one-brief.ts` — capture agent_id + wall-clock + +`RunOutcome` gains `durationMs: number | null`. 
The adapter captures `agent_id` from the **first stream message that carries one** (run-one-brief drains the stream before `wait()`, so it's available), and `wait()` returns it as `agentId`. `durationMs` flows from `outcomeFromResult`. No behaviour change to the dry-run/gate paths. + +### 3. `manifest.ts` — wall-clock + honest token note + +Add `wall_clock_ms: number | null` (from `durationMs`). When `tokens` is `null` on a *finished live* run, append a note: `"tokens unavailable: @cursor/sdk local runtime emits no usage events (see spike 2026-05-31)"`. `tokens` field stays (null for local). + +### 4. `collect-run.ts` — scope to run-emitted traces + +`PreparedRun` gains `preexistingTracePaths: string[]` — the set of `*.jsonl` present under `runDir` immediately after `prepareRun`'s baseline commit (the base checkout's committed traces). `collectRun` excludes that set, so `tracePaths` contains only traces the run produced. This is deterministic (no mtime/clock reliance) and robust to gitignored trace locations (e.g. `wip/drive-trace/`, where the real trace landed). `agent_id` matching then runs over the run-emitted set only. + +## Coherence rationale + +One reviewer holds it in one sitting: every change serves "record the run faithfully," and they're entangled — the `agent_id` fix is what makes `collect-run`'s matching work, the mapper extraction is what makes both testable, and the wall-clock capture is the efficiency metric that stands in for the tokens the SDK won't give us. Rolls back as one unit (one new pure module + additive manifest/outcome fields + a `collect-run` scoping change). Touches no production package. 
+ +## Scope + +**In:** new `sdk-events.ts` (+ tests with real-shape fixtures); `sdk-adapter.ts` (import the mappers, capture stream `agent_id`); `run-one-brief.ts` (`RunOutcome.durationMs`, agent_id wiring); `manifest.ts` (`wall_clock_ms` + token note); `collect-run.ts` + `prepare-run.ts` (`preexistingTracePaths` snapshot + exclusion); `run-arm.ts` (thread `wall_clock_ms` into the enriched manifest); the spike artifact; SKILL.md / KNOWN-ISSUES note on the token gap; new tests wired into `test:scripts`. + +**Out:** a non-SDK token source (Cursor admin/usage API, CLI telemetry) — deferred, out of scope (spike decision). The k=N A/B loop, aggregation, CI gate — TML-2737. The judge — TML-2736. + +## Pre-investigated edge cases + +| Edge case | Disposition | Notes | +|---|---|---| +| Local runtime emits no usage event | Documented, not fixed | Confirmed by spike; `tokens: null` + note is the honest record. | +| Real trace landed in gitignored `wip/drive-trace/` | Drove the design | Snapshot-exclusion (not git-diff) is why scoping works for gitignored traces. | +| `agent_id` present on stream but not outcome | Core of the fix | Capture from the stream message, not `wait()`. | +| Multiple run-emitted traces remain after exclusion | Matching handles it | `agent_id` match, else newest, over the run-emitted set. | + +## Slice-specific done conditions + +- [ ] A test feeds the **real captured** `status`/`assistant`/outcome shapes (from the spike) through `sdk-events.ts` and asserts `agent_id` + `durationMs` extraction — with `@cursor/sdk` not installed. +- [ ] A `collect-run` test with a baseline-committed trace + a run-emitted trace asserts only the latter is returned. + +## Open Questions + +1. **Snapshot `preexistingTracePaths` in `prepare-run` vs re-scan in `collect-run`?** Working position: snapshot in `prepare-run` (deterministic, captures the exact pre-run state) and pass it through `PreparedRun`. Re-scanning in `collect-run` would race any late base writes. 
+ +## References + +- Parent project: `projects/drive-judge-harness/spec.md` +- Spike: `projects/drive-judge-harness/spikes/2026-05-31-sdk-token-usage-retrieval.md` +- Linear: [TML-2757](https://linear.app/prisma-company/issue/TML-2757) (blocks TML-2737) +- Surfaces: `skills-contrib/drive-judge-harness/{sdk-adapter,run-one-brief,manifest,collect-run,prepare-run,run-arm}.ts` +- First-run evidence: manifest at `run-arm-i12-…/run-manifest.json` (agent_id null, tokens 0, polluted trace list) diff --git a/projects/drive-judge-harness/spikes/2026-05-31-sdk-token-usage-retrieval.md b/projects/drive-judge-harness/spikes/2026-05-31-sdk-token-usage-retrieval.md new file mode 100644 index 0000000000..34ea18bf96 --- /dev/null +++ b/projects/drive-judge-harness/spikes/2026-05-31-sdk-token-usage-retrieval.md @@ -0,0 +1,26 @@ +# Spike: can per-run token usage be retrieved from `@cursor/sdk` for a local run? + +**Date:** 2026-05-31 · **Trigger:** the first live `run-arm` (composer-2.5, i12-halt) returned `tokens: {all zero}`. **Question:** is the token signal — our stated #1 efficiency metric after correctness — obtainable from the SDK for a *local-runtime* run, via the stream, the run outcome, the `analytics` surface, or the cloud-API `getRun`? + +## Answer + +**No. Token usage is not retrievable via the `@cursor/sdk` public surface for local runs, by any path.** Wall-clock (`durationMs`) is available and becomes the primary efficiency metric; `tokens` is honestly `null` from the runtime. + +## Evidence (`@cursor/sdk@1.0.15`) + +A throwaway probe spawned a trivial local run and dumped every stream message + the `wait()` outcome: + +- **Stream messages** — only two types across the whole run: `status` `{ type, agent_id, run_id, status }` and `assistant` `{ type, agent_id, run_id, message }`. 
**No `turnEnded` / `usage` event is emitted by the local runtime.** (The SDK *does* define a `usage: { inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens }` schema, but it rides on a `turnEnded` update that only the **cloud** runtime streams.) +- **Run outcome** (`wait()`) — `{ id, status, result, model, durationMs }`. Carries wall-clock (`durationMs`), no tokens. `agent_id` is **not** here; it is on the stream messages. +- **`analytics.d.ts`** — emit-only outbound telemetry (`trackSdkRunCreated/Completed/SendLatency`, `flushSdkAnalytics`). No read-back API. The event props (`SdkRunCreatedProps`, `SdkRunCompletedProps`, `SdkRunSendLatencyProps`) carry `turn_count`, latency, `end_reason` — **no token counts**. +- **`cloud-api-client` `getRun({agentId,runId}) → V1Run`** — `{ id, agentId, status, createdAt, updatedAt, durationMs?, result?, git? }`. **No tokens.** `RunResultMetadata` and `executor-types.d.ts` have zero token/usage/cost fields. (Also a cloud-agent query; a local run is not necessarily registered there.) + +## Decision (re-route) + +Proceed on **option (d)**: + +- Capture `durationMs` (wall-clock) from the run outcome → the primary Tier-2 efficiency metric for local runs. +- `tokens` stays `null` for local runs, with an explicit manifest note + a documented SDK limitation (consumption gotcha). Not a bug in our extraction — there is nothing to extract. +- A future token source must come from **outside** the SDK (a Cursor admin/usage API, or CLI-internal telemetry). Out of scope for the fix slice. + +Companion clean fixes (same slice): capture `agent_id` from the stream `status` message; scope `collect-run` to traces emitted *during* the run (exclude baseline-committed traces). 
diff --git a/skills-contrib/drive-judge-harness/KNOWN-ISSUES.md b/skills-contrib/drive-judge-harness/KNOWN-ISSUES.md index 08d8abff2d..1a9d53e776 100644 --- a/skills-contrib/drive-judge-harness/KNOWN-ISSUES.md +++ b/skills-contrib/drive-judge-harness/KNOWN-ISSUES.md @@ -52,3 +52,21 @@ The token-usage signal this harness needs comes from `TurnEndedUpdate.usage`, wh - read the per-turn `usage` field through a small, explicitly-bounded structural view in `sdk-adapter.ts` (guarded at runtime; no bare casts) rather than a fabricated full mirror of the SDK's types. When upstream ships resolvable types, replace that structural view with the real `TurnEndedUpdate` import and delete the workaround. + +## 2. The local runtime emits no token-usage signal at all + +Distinct from (and more fundamental than) the type-resolution gap above: even at **runtime**, the `@cursor/sdk` *local* runtime never emits a usage signal, so there is nothing to read regardless of types. + +Confirmed by a probe against `@cursor/sdk@1.0.15`: + +- The local `run.stream()` yields only `status` and `assistant` messages — **no `turnEnded`/`usage` event** (that update is streamed only by the *cloud* runtime). +- The `run.wait()` outcome (`{ id, status, result, model, durationMs }`) carries wall-clock but **no tokens**. +- The cloud `getRun → V1Run` (`{ id, agentId, status, createdAt, updatedAt, durationMs?, result?, git? }`), `RunResultMetadata`, and the `analytics` surface (emit-only `trackSdkRun*`; props carry `turn_count`/latency/`end_reason`) all carry **no token counts**. + +### Impact on this harness + +For local runs, `tokens` is `null` (with a manifest note), and **`wall_clock_ms` (the outcome's `durationMs`) is the primary efficiency metric.** `accumulateUsage` remains wired, so usage flows automatically if a cloud run (which does stream `turnEnded`) is used, or once a non-SDK local token source exists. 
+ +### Suggested fix (upstream) + +Stream `turnEnded` (with `usage`) from the local runtime as the cloud runtime already does, or expose per-run token counts on the run outcome / a queryable usage API. diff --git a/skills-contrib/drive-judge-harness/SKILL.md b/skills-contrib/drive-judge-harness/SKILL.md index 9f1c7fc579..da63dac466 100644 --- a/skills-contrib/drive-judge-harness/SKILL.md +++ b/skills-contrib/drive-judge-harness/SKILL.md @@ -2,14 +2,15 @@ name: drive-judge-harness description: > Spawns one Drive orchestrator run on a golden-case brief with a pinned model, - accumulates per-run token usage, and writes a run manifest — the corpus - generator the Drive LLM judge calibrates against. Supports a pinned skill-bundle - input (prepare-run → spawn → collect-run) so runs are reproducible against a - known base ref + skill version. Use when you want to run a golden Drive brief - end-to-end, produce a natively-instrumented run, accumulate token usage from the - Cursor SDK, or validate the post-hoc trace parser against a transcript corpus. - Live execution is gated behind --live + CURSOR_API_KEY; the default is a dry-run - that makes no live call. + records per-run token usage / USD cost / wall-clock, and writes a run manifest — + the corpus generator the Drive LLM judge calibrates against. Runs on the Claude + Agent SDK by default (Cursor SDK selectable via --runtime cursor). Supports a + pinned skill-bundle input (prepare-run → spawn → collect-run) so runs are + reproducible against a known base ref + skill version. Use when you want to run a + golden Drive brief end-to-end, produce a natively-instrumented run, capture + token/cost/wall-clock, or validate the post-hoc trace parser against a transcript + corpus. Live execution is gated behind --live + the runtime's API key; the default + is a dry-run that makes no live call. --- # Drive: Judge harness (run-one-brief / run-arm) @@ -48,24 +49,56 @@ builds the k=N A/B loop on top of. 
- `run-one-brief.ts` — `runOneBrief(config, deps)` + a CLI. Owns the live-execution gate and orchestration. Accepts `runDir` so the orchestrator spawns inside the prepared checkout. -- `sdk-adapter.ts` — the **only** module that touches `@cursor/sdk`, via a - dynamic import reached solely on the live path. Uses the `cwd` passed from - `run-one-brief` rather than the harness's `process.cwd()`. +- `claude-adapter.ts` — the **default** live runtime: the only module that + imports `@anthropic-ai/claude-agent-sdk`, via a dynamic import reached solely on + the live claude path. Runs the orchestrator with the injected `.claude/skills/` + loaded (`settingSources: ['project']`, `skills: 'all'`) inside the prepared + checkout, unattended (`permissionMode: 'bypassPermissions'`), with an optional + `maxBudgetUsd` hard cap. Reports tokens + `total_cost_usd` + `duration_ms` + + `num_turns` natively. +- `sdk-adapter.ts` — the **secondary** Cursor runtime: the only module that + touches `@cursor/sdk`, via a dynamic import reached solely on the live cursor + path. Selected with `--runtime cursor`. +- `claude-events.ts` / `sdk-events.ts` — pure message-shape mappers for each + runtime (no SDK import), so the adapters' extraction logic is unit-testable with + neither SDK installed. - `validate-parser.ts` — validates `drive-diagnose-run/posthoc.ts` over a transcript corpus, tallying reconstruction confidence (clears TML-2728). - `judge/` — the bespoke-minimal LLM judge (TML-2736). Grades one Drive run from its diff + acceptance set + trace excerpts and emits the `intent` correctness component the scorecard already reads. See § The LLM judge below. +## Runtimes (claude default, cursor secondary) + +The harness is decoupled from any one agent vendor by a small seam in +`run-one-brief.ts` (`CreateAgent` / `OrchestratorRun` / `RunOutcome`); each runtime +is one adapter behind it, selected with `--runtime <claude|cursor>` (default +`claude`). + +- **`claude`** (default) — Anthropic's Claude Agent SDK.
The native home of the + SKILL.md + subagent conventions the drive-* skills use, and it reports + per-run **tokens, USD cost (`cost_usd`), wall-clock (`wall_clock_ms`), and turn + count (`num_turns`)** on its terminal result message. Key: `ANTHROPIC_API_KEY`. + Supports `--max-budget-usd <n>` (a hard per-run dollar cap; the run aborts with + an error result if the estimate reaches it). +- **`cursor`** (secondary) — `@cursor/sdk`. Kept for spot-checking the Cursor + substrate. Its **local runtime emits no token usage** (see KNOWN-ISSUES.md and + the spike), so cursor runs record `tokens: null` and rely on `wall_clock_ms` + alone. Key: `CURSOR_API_KEY`. + +The model is pinned with `--model` (e.g. `claude-haiku-4-5` for cheap calibration, +`claude-sonnet-4-5` for realistic runs); the harness never hardcodes one. + ## The live-execution gate (safety property) -A live run requires **both** `--live` **and** a present `CURSOR_API_KEY`. -Otherwise the harness takes the **dry-run** path: it never imports `@cursor/sdk`, -never makes a network call, and writes a manifest with `status: "dry-run"`, -`tokens: null`. Because the SDK is reached only through `sdk-adapter.ts`'s -dynamic import on the live path, **typecheck / test / lint / CI all stay green -with no `CURSOR_API_KEY` set and `@cursor/sdk` not installed.** Tests inject a -mock `createAgent` and never make a live call. +A live run requires **both** `--live` **and** the selected runtime's API key +present (`ANTHROPIC_API_KEY` for claude, `CURSOR_API_KEY` for cursor). Otherwise +the harness takes the **dry-run** path: it never imports an SDK, never makes a +network call, and writes a manifest with `status: "dry-run"`, `tokens: null`. +Because each SDK is reached only through its adapter's dynamic import on the live +path, **typecheck / test / lint / CI all stay green with no API key set and +neither SDK installed.** Tests inject a mock `createAgent` and never make a live +call.
## The pinned skill-bundle pipeline (run-arm) @@ -168,6 +201,18 @@ trace via the emitter. The spawned orchestrator self-instruments its Drive methodology events into `--trace-file` via `drive-record-traces`; the harness owns only the token manifest. +**Token availability is per-runtime.** On the default **claude** runtime the +terminal result message carries cumulative `usage` + `total_cost_usd`, so a run +records real `tokens` **and** `cost_usd` (plus `num_turns` and `wall_clock_ms`). +On the secondary **cursor** runtime the *local* `@cursor/sdk` runtime emits no +usage signal at all — nothing in its stream, run outcome, `getRun`/`V1Run` cloud +query, or `analytics` surface carries token counts (see KNOWN-ISSUES.md § 2 for +the probe findings) — so cursor runs record `tokens: null` with a note, and +fall back to **`wall_clock_ms`** as the efficiency metric. `run-one-brief` reads +`tokens` from the run outcome when the runtime provides them (claude), else +accumulates per-turn usage (the cursor path), so both runtimes feed the same +`TokenTotals` shape. + ## The LLM judge (`judge/`) A bespoke-minimal grader that turns the run's artifacts (diff + golden diff --git a/skills-contrib/drive-judge-harness/claude-adapter.ts b/skills-contrib/drive-judge-harness/claude-adapter.ts new file mode 100644 index 0000000000..e142719186 --- /dev/null +++ b/skills-contrib/drive-judge-harness/claude-adapter.ts @@ -0,0 +1,94 @@ +import { outcomeFromResult, streamEventFromMessage } from './claude-events.ts'; +import type { CreateAgent, OrchestratorRun, RunOutcome } from './run-one-brief.ts'; +import { isRecord } from './sdk-events.ts'; + +// The ONLY module that imports `@anthropic-ai/claude-agent-sdk`, loaded lazily +// by run-one-brief on the live claude path. Never reached under test (tests +// inject a mock `createAgent`). The SDK is not installed during development or +// CI; the dynamic import path is the only gate. 
+// +// `query({ prompt, options })` returns an async generator of messages. We iterate +// the entire generator in `stream()`, yielding normalized RunStreamEvents, and +// capture the terminal `result` message for `wait()` to consume. run-one-brief +// drains `stream()` fully before calling `wait()`, so the captured result is +// always available by then. +// +// Pure message-shape mappers live in `claude-events.ts` — no SDK import there, +// fully unit-testable with the SDK absent. + +/** Normalize an SDK `query()` generator into the harness's `OrchestratorRun`. */ +function adaptQuery(generator: AsyncIterable): OrchestratorRun { + let capturedResult: unknown = null; + return { + async *stream() { + for await (const msg of generator) { + if (isResultMessage(msg)) { + capturedResult = msg; + } + yield streamEventFromMessage(msg); + } + }, + async wait(): Promise { + const parsed = capturedResult !== null ? outcomeFromResult(capturedResult) : null; + if (parsed === null) { + return { + status: 'error', + runId: null, + agentId: null, + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, + }; + } + return { + status: parsed.status, + runId: parsed.runId, + agentId: null, + durationMs: parsed.durationMs, + tokens: parsed.tokens, + costUsd: parsed.costUsd, + numTurns: parsed.numTurns, + }; + }, + }; +} + +function isResultMessage(msg: unknown): msg is Record { + return isRecord(msg) && msg.type === 'result'; +} + +function isAsyncIterable(v: unknown): v is AsyncIterable { + return v !== null && typeof v === 'object' && Symbol.asyncIterator in v; +} + +/** Live `CreateAgent` backed by `@anthropic-ai/claude-agent-sdk`. Reached only + * on the live claude path. 
*/ +export const createClaudeAgent: CreateAgent = async ({ model, prompt, cwd, maxBudgetUsd }) => { + const apiKey = process.env.ANTHROPIC_API_KEY; + if (typeof apiKey !== 'string' || apiKey.length === 0) { + throw new Error('ANTHROPIC_API_KEY is required for a live claude run'); + } + + // Dynamic import keeps this module evaluatable (typecheck/lint/test) with + // @anthropic-ai/claude-agent-sdk absent from node_modules. + const { query } = await import('@anthropic-ai/claude-agent-sdk'); + + const options: Record = { + cwd, + model, + settingSources: ['project'], + skills: 'all', + permissionMode: 'bypassPermissions', + allowDangerouslySkipPermissions: true, + }; + if (maxBudgetUsd != null) { + options.maxBudgetUsd = maxBudgetUsd; + } + + const rawResult: unknown = query({ prompt, options }); + if (!isAsyncIterable(rawResult)) { + throw new Error('SDK query() did not return an AsyncIterable'); + } + return adaptQuery(rawResult); +}; diff --git a/skills-contrib/drive-judge-harness/claude-events.ts b/skills-contrib/drive-judge-harness/claude-events.ts new file mode 100644 index 0000000000..6520ee20e1 --- /dev/null +++ b/skills-contrib/drive-judge-harness/claude-events.ts @@ -0,0 +1,82 @@ +import type { RunStreamEvent } from './run-one-brief.ts'; +import { asString, isRecord } from './sdk-events.ts'; +import { accumulateUsage, type TokenTotals, type TurnUsage } from './usage.ts'; + +// Pure message-shape mappers for the Anthropic Claude Agent SDK. +// +// These operate over `unknown` and have zero dependency on +// `@anthropic-ai/claude-agent-sdk`, so they can be unit-tested with the SDK +// absent. The sole SDK importer remains `claude-adapter.ts`, which wires these +// utilities into the live path. 
+// +// Real shapes from @anthropic-ai/claude-agent-sdk (confirmed from SDK docs): +// +// stream assistant: { type: "assistant", message: { id, usage: { +// input_tokens, output_tokens, +// cache_creation_input_tokens, +// cache_read_input_tokens } } } +// terminal result: { type: "result", subtype: "success"|"error_max_turns"|..., +// session_id, duration_ms, num_turns, total_cost_usd, +// usage: { input_tokens, output_tokens, +// cache_creation_input_tokens, +// cache_read_input_tokens } } +// +// Field mapping (SDK snake_case -> harness camelCase): +// input_tokens -> inputTokens +// output_tokens -> outputTokens +// cache_read_input_tokens -> cacheReadTokens +// cache_creation_input_tokens -> cacheWriteTokens + +function mapUsage(usage: Record): TurnUsage { + const num = (v: unknown): number | null => (typeof v === 'number' ? v : null); + return { + inputTokens: num(usage.input_tokens), + outputTokens: num(usage.output_tokens), + cacheReadTokens: num(usage.cache_read_input_tokens), + cacheWriteTokens: num(usage.cache_creation_input_tokens), + }; +} + +/** Read usage from an assistant message's nested `message.usage` object. + * Returns null if the message is not an assistant type or has no usage. */ +export function usageFromAssistant(msg: unknown): TurnUsage | null { + if (!isRecord(msg) || msg.type !== 'assistant') return null; + const message = msg.message; + if (!isRecord(message)) return null; + const usage = message.usage; + if (!isRecord(usage)) return null; + return mapUsage(usage); +} + +/** Map a raw Claude SDK stream message onto a normalized `RunStreamEvent`. + * An assistant message with usage maps to `turn-ended`; everything else is `other`. */ +export function streamEventFromMessage(msg: unknown): RunStreamEvent { + const usage = usageFromAssistant(msg); + if (usage !== null) return { kind: 'turn-ended', usage }; + return { kind: 'other' }; +} + +/** Map a raw Claude SDK terminal `result` message to the harness outcome fields. 
+ * Returns null when `msg.type !== 'result'`. Degrades gracefully on non-records. */ +export function outcomeFromResult(msg: unknown): { + status: 'finished' | 'error'; + runId: string | null; + tokens: TokenTotals | null; + durationMs: number | null; + costUsd: number | null; + numTurns: number | null; +} | null { + if (!isRecord(msg) || msg.type !== 'result') return null; + const status: 'finished' | 'error' = msg.subtype === 'success' ? 'finished' : 'error'; + const runId = asString(msg.session_id); + const durationMs = typeof msg.duration_ms === 'number' ? msg.duration_ms : null; + const costUsd = typeof msg.total_cost_usd === 'number' ? msg.total_cost_usd : null; + const numTurns = typeof msg.num_turns === 'number' ? msg.num_turns : null; + + const usageRaw = msg.usage; + const tokens: TokenTotals | null = isRecord(usageRaw) + ? accumulateUsage([mapUsage(usageRaw)]) + : null; + + return { status, runId, tokens, durationMs, costUsd, numTurns }; +} diff --git a/skills-contrib/drive-judge-harness/collect-run.ts b/skills-contrib/drive-judge-harness/collect-run.ts index fb2e70b123..51935332b6 100644 --- a/skills-contrib/drive-judge-harness/collect-run.ts +++ b/skills-contrib/drive-judge-harness/collect-run.ts @@ -1,9 +1,9 @@ import { spawnSync } from 'node:child_process'; -import { type Dirent, readdirSync, readFileSync, statSync } from 'node:fs'; +import { readFileSync, statSync } from 'node:fs'; import { type } from 'arktype'; -import { join } from 'pathe'; import { Slice1TraceEvent } from '../drive-record-traces/schema.ts'; import type { PreparedRun } from './prepare-run.ts'; +import { findJsonlFiles } from './trace-files.ts'; export type CollectedRun = { tracePaths: string[]; @@ -13,25 +13,6 @@ export type CollectedRun = { untraced: boolean; }; -function findJsonlFiles(dir: string): string[] { - const results: string[] = []; - let entries: Dirent[]; - try { - entries = readdirSync(dir, { withFileTypes: true }); - } catch { - return results; - } - for (const 
entry of entries) { - const fullPath = join(dir, entry.name); - if (entry.isDirectory()) { - results.push(...findJsonlFiles(fullPath)); - } else if (entry.isFile() && entry.name.endsWith('.jsonl')) { - results.push(fullPath); - } - } - return results; -} - function firstLineOf(filePath: string): string | null { let content: string; try { @@ -106,9 +87,10 @@ export function collectRun( prepared: PreparedRun, opts?: { agentId?: string | null }, ): CollectedRun { - const { runDir, prepareCommit } = prepared; + const { runDir, prepareCommit, preexistingTracePaths } = prepared; - const allJsonl = findJsonlFiles(runDir); + const preexistingSet = new Set(preexistingTracePaths); + const allJsonl = findJsonlFiles(runDir).filter((p) => !preexistingSet.has(p)); const tracePaths = allJsonl.filter(isValidTrace); let matchedTrace: string | null = null; diff --git a/skills-contrib/drive-judge-harness/manifest.ts b/skills-contrib/drive-judge-harness/manifest.ts index a2ed635a2d..6b1e14ca31 100644 --- a/skills-contrib/drive-judge-harness/manifest.ts +++ b/skills-contrib/drive-judge-harness/manifest.ts @@ -23,13 +23,22 @@ export type RunManifest = { schema_version: '1'; case_slug: string; model: string; + /** The adapter runtime used for this run. */ + runtime: 'claude' | 'cursor'; status: RunStatus; run_id: string | null; agent_id: string | null; trace_file: string; /** Accumulated per-run usage, or `null` when no live run produced a signal - * (dry-run, or a startup failure before any turn completed). */ + * (dry-run, startup failure, or local runtime which emits no usage events). */ tokens: TokenTotals | null; + /** Wall-clock duration reported by the SDK (`wait()` outcome `durationMs`), + * or `null` for dry-run, startup-failed, and error paths. */ + wall_clock_ms: number | null; + /** Total USD cost reported by the runtime, or `null` when unavailable. */ + cost_usd: number | null; + /** Number of turns reported by the runtime, or `null` when unavailable. 
*/ + num_turns: number | null; started_at: string; finished_at: string | null; notes: string[]; diff --git a/skills-contrib/drive-judge-harness/prepare-run.ts b/skills-contrib/drive-judge-harness/prepare-run.ts index cec1c45cdc..3c9ce740c8 100644 --- a/skills-contrib/drive-judge-harness/prepare-run.ts +++ b/skills-contrib/drive-judge-harness/prepare-run.ts @@ -1,4 +1,5 @@ import { spawnSync } from 'node:child_process'; +import { findJsonlFiles } from './trace-files.ts'; export type SkillBundleRef = { repoDir: string; @@ -19,6 +20,11 @@ export type PreparedRun = { skillBundleSha: string; prepareCommit: string; materialized: boolean; + /** Paths of all `.jsonl` files present under `runDir` immediately after the + * baseline commit — i.e. traces committed in the base checkout before the + * agent run starts. `collectRun` excludes these so only run-emitted traces + * are collected. Deterministic snapshot (no mtime reliance). */ + preexistingTracePaths: string[]; }; export type PrepareRunDeps = { @@ -97,6 +103,8 @@ export function prepareRun(config: PrepareRunConfig, deps?: PrepareRunDeps): Pre git(['commit', '--allow-empty', '-m', 'prepare-run baseline'], config.runDir); const prepareCommit = git(['rev-parse', 'HEAD'], config.runDir).stdout; + const preexistingTracePaths = findJsonlFiles(config.runDir); + return { runDir: config.runDir, baseRef: config.baseRef, @@ -104,5 +112,6 @@ export function prepareRun(config: PrepareRunConfig, deps?: PrepareRunDeps): Pre skillBundleSha, prepareCommit, materialized: matResult.ok, + preexistingTracePaths, }; } diff --git a/skills-contrib/drive-judge-harness/run-arm.ts b/skills-contrib/drive-judge-harness/run-arm.ts index 7abc4ac901..bf122cbf00 100644 --- a/skills-contrib/drive-judge-harness/run-arm.ts +++ b/skills-contrib/drive-judge-harness/run-arm.ts @@ -23,6 +23,10 @@ export type RunArmConfig = { manifestFile: string; live: boolean; apiKeyPresent: boolean; + /** The adapter runtime to use. Defaults to `'claude'` in the CLI. 
*/ + runtime: 'claude' | 'cursor'; + /** Optional hard per-run USD budget cap (Claude adapter only). */ + maxBudgetUsd?: number; }; export type RunArmDeps = { @@ -57,6 +61,8 @@ export async function runArm(config: RunArmConfig, deps?: RunArmDeps): Promise --base-ref --bundle-ref --run-dir ' + '--case --model ' + - '[--bundle-repo ] [--manifest-file ] [--live]\n' + - 'Live execution requires both --live and CURSOR_API_KEY.'; + '[--bundle-repo ] [--manifest-file ] [--live] ' + + '[--runtime ] [--max-budget-usd ]\n' + + 'Live execution requires both --live and the runtime API key. Default runtime is claude.'; function parseArgs(argv: string[]): { repo?: string; @@ -101,6 +108,8 @@ function parseArgs(argv: string[]): { model?: string; manifestFile?: string; live: boolean; + runtime: 'claude' | 'cursor'; + maxBudgetUsd?: number; } { let repo: string | undefined; let baseRef: string | undefined; @@ -111,6 +120,8 @@ function parseArgs(argv: string[]): { let model: string | undefined; let manifestFile: string | undefined; let live = false; + let runtime: 'claude' | 'cursor' = 'claude'; + let maxBudgetUsd: number | undefined; for (let i = 0; i < argv.length; i++) { const arg = argv[i]; @@ -153,12 +164,42 @@ function parseArgs(argv: string[]): { case '--live': live = true; break; + case '--runtime': { + const val = takeValue(); + if (val !== 'claude' && val !== 'cursor') { + process.stderr.write(`--runtime must be "claude" or "cursor"\n${USAGE}\n`); + process.exit(1); + } + runtime = val; + break; + } + case '--max-budget-usd': { + const val = Number(takeValue()); + if (!Number.isFinite(val) || val <= 0) { + process.stderr.write(`--max-budget-usd must be a positive number\n${USAGE}\n`); + process.exit(1); + } + maxBudgetUsd = val; + break; + } default: process.stderr.write(`Unknown argument: ${arg}\n${USAGE}\n`); process.exit(1); } } - return { repo, baseRef, bundleRef, bundleRepo, runDir, caseDir, model, manifestFile, live }; + return { + repo, + baseRef, + bundleRef, + 
bundleRepo, + runDir, + caseDir, + model, + manifestFile, + live, + runtime, + maxBudgetUsd, + }; } async function main(): Promise { @@ -179,6 +220,9 @@ async function main(): Promise { const bundleRepoDir = parsed.bundleRepo ?? repoUnderTestDir; const manifestFile = parsed.manifestFile ?? join(parsed.runDir, 'run-manifest.json'); const traceFile = join(parsed.runDir, 'run-trace.jsonl'); + const runtime = parsed.runtime; + const apiKeyEnvVar = runtime === 'cursor' ? 'CURSOR_API_KEY' : 'ANTHROPIC_API_KEY'; + const apiKeyValue = process.env[apiKeyEnvVar]; const result = await runArm({ repoUnderTestDir, @@ -190,8 +234,9 @@ async function main(): Promise { traceFile, manifestFile, live: parsed.live, - apiKeyPresent: - typeof process.env.CURSOR_API_KEY === 'string' && process.env.CURSOR_API_KEY.length > 0, + apiKeyPresent: typeof apiKeyValue === 'string' && apiKeyValue.length > 0, + runtime, + maxBudgetUsd: parsed.maxBudgetUsd, }); process.stdout.write(`${result.manifestContent}\n`); diff --git a/skills-contrib/drive-judge-harness/run-one-brief.ts b/skills-contrib/drive-judge-harness/run-one-brief.ts index 313cb02540..6298065a5d 100644 --- a/skills-contrib/drive-judge-harness/run-one-brief.ts +++ b/skills-contrib/drive-judge-harness/run-one-brief.ts @@ -28,6 +28,10 @@ export type RunOutcome = { status: 'finished' | 'error'; runId: string | null; agentId: string | null; + durationMs: number | null; + tokens: TokenTotals | null; + costUsd: number | null; + numTurns: number | null; }; /** A started orchestrator run the harness observes. */ @@ -37,11 +41,12 @@ export type OrchestratorRun = { }; /** Spawns an orchestrator run for a pinned model + prompt. Injected in tests; - * the live default is loaded lazily from `sdk-adapter.ts`. */ + * the live default is loaded lazily from the matching adapter module. 
*/ export type CreateAgent = (opts: { model: string; prompt: string; cwd: string; + maxBudgetUsd?: number; }) => Promise; export type RunOneBriefConfig = { @@ -54,8 +59,12 @@ export type RunOneBriefConfig = { runDir: string; /** Caller asked for a live run. */ live: boolean; - /** Whether a Cursor API key is present in the environment. */ + /** Whether the runtime's API key is present in the environment. */ apiKeyPresent: boolean; + /** The adapter runtime to use. Defaults to `'claude'` in the CLI. */ + runtime: 'claude' | 'cursor'; + /** Optional hard per-run USD budget cap (Claude adapter only). */ + maxBudgetUsd?: number; }; export type RunOneBriefDeps = { @@ -85,11 +94,15 @@ function gateSatisfied(config: RunOneBriefConfig): boolean { return config.live && config.apiKeyPresent; } -async function defaultCreateAgent(): Promise { - // Lazy import so `@cursor/sdk` is only required when a live run is actually +async function defaultCreateAgent(runtime: 'claude' | 'cursor'): Promise { + // Lazy import so the SDK is only required when a live run is actually // requested without an injected agent. Never reached under test. - const adapter = await import('./sdk-adapter.ts'); - return adapter.createCursorAgent; + if (runtime === 'cursor') { + const adapter = await import('./sdk-adapter.ts'); + return adapter.createCursorAgent; + } + const adapter = await import('./claude-adapter.ts'); + return adapter.createClaudeAgent; } /** Run one brief end-to-end (or dry-run) and write the manifest. */ @@ -107,18 +120,23 @@ export async function runOneBrief( model: config.model, trace_file: config.traceFile, started_at: startedAt, + runtime: config.runtime, } as const; if (!gateSatisfied(config)) { + const keyName = config.runtime === 'cursor' ? 'CURSOR_API_KEY' : 'ANTHROPIC_API_KEY'; const reason = !config.live - ? 'dry-run: live execution not requested (pass --live and set CURSOR_API_KEY to run live)' - : 'dry-run: live requested but CURSOR_API_KEY is absent'; + ? 
`dry-run: live execution not requested (pass --live and set ${keyName} to run live)` + : `dry-run: live requested but ${keyName} is absent`; const manifest: RunManifest = { ...baseManifest, status: 'dry-run', run_id: null, agent_id: null, tokens: null, + wall_clock_ms: null, + cost_usd: null, + num_turns: null, finished_at: now(), notes: [reason, 'no SDK call was made; no orchestrator run was spawned'], }; @@ -126,12 +144,17 @@ export async function runOneBrief( return { status: 'dry-run', manifest, manifestContent, createAgentCalled: false }; } - const createAgent = deps.createAgent ?? (await defaultCreateAgent()); + const createAgent = deps.createAgent ?? (await defaultCreateAgent(config.runtime)); const prompt = assemblePrompt(golden); let run: OrchestratorRun; try { - run = await createAgent({ model: config.model, prompt, cwd: config.runDir }); + run = await createAgent({ + model: config.model, + prompt, + cwd: config.runDir, + maxBudgetUsd: config.maxBudgetUsd, + }); } catch (err) { const manifest: RunManifest = { ...baseManifest, @@ -139,6 +162,9 @@ export async function runOneBrief( run_id: null, agent_id: null, tokens: null, + wall_clock_ms: null, + cost_usd: null, + num_turns: null, finished_at: now(), notes: [`startup-failed: ${err instanceof Error ? err.message : String(err)}`], }; @@ -154,7 +180,16 @@ export async function runOneBrief( } } const outcome = await run.wait(); - const tokens: TokenTotals = accumulateUsage(usageUpdates); + const accumulatedTokens: TokenTotals | null = + usageUpdates.length > 0 ? accumulateUsage(usageUpdates) : null; + const tokens: TokenTotals | null = outcome.tokens ?? 
accumulatedTokens; + + const notes: string[] = []; + if (outcome.status === 'finished' && tokens === null) { + notes.push( + 'tokens unavailable: @cursor/sdk local runtime emits no usage events (see KNOWN-ISSUES.md)', + ); + } const manifest: RunManifest = { ...baseManifest, @@ -162,8 +197,11 @@ export async function runOneBrief( run_id: outcome.runId, agent_id: outcome.agentId, tokens, + wall_clock_ms: outcome.durationMs, + cost_usd: outcome.costUsd, + num_turns: outcome.numTurns, finished_at: now(), - notes: [], + notes, }; const manifestContent = writeManifest(config.manifestFile, manifest); return { status: outcome.status, manifest, manifestContent, createAgentCalled: true }; @@ -171,13 +209,17 @@ export async function runOneBrief( // A live stream/wait can throw mid-run; write an error manifest with the // usage gathered so far so the token signal and the failure survive rather // than escaping as an unhandled rejection out of `void main()`. - const tokens: TokenTotals = accumulateUsage(usageUpdates); + const tokens: TokenTotals | null = + usageUpdates.length > 0 ? accumulateUsage(usageUpdates) : null; const manifest: RunManifest = { ...baseManifest, status: 'error', run_id: null, agent_id: null, tokens, + wall_clock_ms: null, + cost_usd: null, + num_turns: null, finished_at: now(), notes: [`error: ${err instanceof Error ? err.message : String(err)}`], }; @@ -193,8 +235,8 @@ export async function runOneBrief( const USAGE = 'Usage: node skills-contrib/drive-judge-harness/run-one-brief.ts ' + '--case --model [--trace-file ] ' + - '[--manifest-file ] [--live]\n' + - 'Live execution requires both --live and CURSOR_API_KEY. Default is dry-run.'; + '[--manifest-file ] [--live] [--runtime ] [--max-budget-usd ]\n' + + 'Live execution requires both --live and the runtime API key. 
Default runtime is claude.'; function parseArgs(argv: string[]): { caseDir?: string; @@ -202,12 +244,16 @@ function parseArgs(argv: string[]): { traceFile?: string; manifestFile?: string; live: boolean; + runtime: 'claude' | 'cursor'; + maxBudgetUsd?: number; } { let caseDir: string | undefined; let model: string | undefined; let traceFile: string | undefined; let manifestFile: string | undefined; let live = false; + let runtime: 'claude' | 'cursor' = 'claude'; + let maxBudgetUsd: number | undefined; for (let i = 0; i < argv.length; i++) { const arg = argv[i]; const takeValue = (): string => { @@ -238,12 +284,30 @@ function parseArgs(argv: string[]): { case '--live': live = true; break; + case '--runtime': { + const val = takeValue(); + if (val !== 'claude' && val !== 'cursor') { + process.stderr.write(`--runtime must be "claude" or "cursor"\n${USAGE}\n`); + process.exit(1); + } + runtime = val; + break; + } + case '--max-budget-usd': { + const val = Number(takeValue()); + if (!Number.isFinite(val) || val <= 0) { + process.stderr.write(`--max-budget-usd must be a positive number\n${USAGE}\n`); + process.exit(1); + } + maxBudgetUsd = val; + break; + } default: process.stderr.write(`Unknown argument: ${arg}\n${USAGE}\n`); process.exit(1); } } - return { caseDir, model, traceFile, manifestFile, live }; + return { caseDir, model, traceFile, manifestFile, live, runtime, maxBudgetUsd }; } async function main(): Promise { @@ -254,6 +318,9 @@ async function main(): Promise { } const traceFile = parsed.traceFile ?? join(parsed.caseDir, 'run-trace.jsonl'); const manifestFile = parsed.manifestFile ?? join(parsed.caseDir, 'run-manifest.json'); + const runtime = parsed.runtime; + const apiKeyEnvVar = runtime === 'cursor' ? 
'CURSOR_API_KEY' : 'ANTHROPIC_API_KEY'; + const apiKeyValue = process.env[apiKeyEnvVar]; const result = await runOneBrief({ caseDir: parsed.caseDir, @@ -262,8 +329,9 @@ async function main(): Promise { manifestFile, runDir: process.cwd(), live: parsed.live, - apiKeyPresent: - typeof process.env.CURSOR_API_KEY === 'string' && process.env.CURSOR_API_KEY.length > 0, + apiKeyPresent: typeof apiKeyValue === 'string' && apiKeyValue.length > 0, + runtime, + maxBudgetUsd: parsed.maxBudgetUsd, }); process.stdout.write(`${result.manifestContent}\n`); diff --git a/skills-contrib/drive-judge-harness/sdk-adapter.ts b/skills-contrib/drive-judge-harness/sdk-adapter.ts index 5660a6b28e..7555f0375d 100644 --- a/skills-contrib/drive-judge-harness/sdk-adapter.ts +++ b/skills-contrib/drive-judge-harness/sdk-adapter.ts @@ -1,6 +1,6 @@ import { Agent } from '@cursor/sdk'; -import type { CreateAgent, OrchestratorRun, RunOutcome, RunStreamEvent } from './run-one-brief.ts'; -import type { TurnUsage } from './usage.ts'; +import type { CreateAgent, OrchestratorRun, RunOutcome } from './run-one-brief.ts'; +import { agentIdFromMessage, outcomeFromResult, streamEventFromMessage } from './sdk-events.ts'; // The ONLY module that touches `@cursor/sdk`, loaded lazily by run-one-brief on // the live path, so typecheck / tests / lint / dry-run never require it. @@ -14,58 +14,10 @@ import type { TurnUsage } from './usage.ts'; // rather than fabricating a full mirror of the SDK's type surface. When upstream // ships self-contained declarations, replace these reads with the real types. // See ./KNOWN-ISSUES.md. - -function isRecord(value: unknown): value is Record { - return typeof value === 'object' && value !== null && !Array.isArray(value); -} - -function asString(value: unknown): string | null { - return typeof value === 'string' ? 
value : null; -} - -function extractUsage(raw: unknown): TurnUsage | null { - if (!isRecord(raw)) return null; - const usage = raw.usage; - if (!isRecord(usage)) return null; - const num = (v: unknown): number | null => (typeof v === 'number' ? v : null); - return { - inputTokens: num(usage.inputTokens), - outputTokens: num(usage.outputTokens), - cacheReadTokens: num(usage.cacheReadTokens), - cacheWriteTokens: num(usage.cacheWriteTokens), - }; -} - -function extractText(raw: unknown): string | null { - if (!isRecord(raw) || raw.type !== 'assistant') return null; - const message = raw.message; - if (!isRecord(message)) return null; - const content = message.content; - if (!Array.isArray(content)) return null; - let text = ''; - for (const block of content) { - if (isRecord(block) && block.type === 'text' && typeof block.text === 'string') { - text += block.text; - } - } - return text.length > 0 ? text : null; -} - -function toStreamEvent(message: unknown): RunStreamEvent { - const usage = extractUsage(message); - if (usage !== null) return { kind: 'turn-ended', usage }; - const text = extractText(message); - if (text !== null) return { kind: 'text', text }; - return { kind: 'other' }; -} - -function adaptOutcome(raw: unknown): RunOutcome { - if (!isRecord(raw)) { - return { status: 'error', runId: null, agentId: null }; - } - const status = raw.status === 'finished' ? 'finished' : 'error'; - return { status, runId: asString(raw.id), agentId: asString(raw.agentId) }; -} +// +// Pure message-shape mappers (isRecord, asString, extractUsage, extractText, +// streamEventFromMessage, agentIdFromMessage, outcomeFromResult) live in +// sdk-events.ts — no SDK import there, fully unit-testable without the SDK. /** Normalize a started SDK run into the harness's `OrchestratorRun`. 
Reads the * run's `stream()` / `wait()` (documented runtime API); the yielded messages @@ -74,14 +26,28 @@ function adaptRun(sdkRun: { stream(): AsyncIterable; wait(): Promise; }): OrchestratorRun { + let capturedAgentId: string | null = null; return { async *stream() { for await (const message of sdkRun.stream()) { - yield toStreamEvent(message); + if (capturedAgentId === null) { + capturedAgentId = agentIdFromMessage(message); + } + yield streamEventFromMessage(message); } }, - async wait() { - return adaptOutcome(await sdkRun.wait()); + async wait(): Promise { + const raw = await sdkRun.wait(); + const { status, runId, durationMs } = outcomeFromResult(raw); + return { + status, + runId, + agentId: capturedAgentId, + durationMs, + tokens: null, + costUsd: null, + numTurns: null, + }; }, }; } diff --git a/skills-contrib/drive-judge-harness/sdk-events.ts b/skills-contrib/drive-judge-harness/sdk-events.ts new file mode 100644 index 0000000000..f208a9d9c6 --- /dev/null +++ b/skills-contrib/drive-judge-harness/sdk-events.ts @@ -0,0 +1,85 @@ +import type { RunStreamEvent } from './run-one-brief.ts'; +import type { TurnUsage } from './usage.ts'; + +// Pure message-shape mappers for the Cursor SDK local runtime. +// +// These operate over `unknown` and have no dependency on `@cursor/sdk`, so they +// can be unit-tested with the SDK absent. The sole SDK importer remains +// `sdk-adapter.ts`, which imports these utilities and wires them into the live path. 
+// +// Real shapes from @cursor/sdk@1.0.15 local runtime (confirmed via a probe; +// see KNOWN-ISSUES.md § 2): +// +// stream status: { type: "status", agent_id, run_id, status } +// stream assistant: { type: "assistant", agent_id, run_id, message } +// wait() outcome: { id, status, result, model, durationMs } +// (no agent_id, no token/usage fields on the local runtime) + +export function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +export function asString(value: unknown): string | null { + return typeof value === 'string' ? value : null; +} + +export function extractUsage(raw: unknown): TurnUsage | null { + if (!isRecord(raw)) return null; + const usage = raw.usage; + if (!isRecord(usage)) return null; + const num = (v: unknown): number | null => (typeof v === 'number' ? v : null); + return { + inputTokens: num(usage.inputTokens), + outputTokens: num(usage.outputTokens), + cacheReadTokens: num(usage.cacheReadTokens), + cacheWriteTokens: num(usage.cacheWriteTokens), + }; +} + +export function extractText(raw: unknown): string | null { + if (!isRecord(raw) || raw.type !== 'assistant') return null; + const message = raw.message; + if (!isRecord(message)) return null; + const content = message.content; + if (!Array.isArray(content)) return null; + let text = ''; + for (const block of content) { + if (isRecord(block) && block.type === 'text' && typeof block.text === 'string') { + text += block.text; + } + } + return text.length > 0 ? text : null; +} + +/** Map a raw SDK stream message onto a normalized `RunStreamEvent`. 
*/ +export function streamEventFromMessage(message: unknown): RunStreamEvent { + const usage = extractUsage(message); + if (usage !== null) return { kind: 'turn-ended', usage }; + const text = extractText(message); + if (text !== null) return { kind: 'text', text }; + return { kind: 'other' }; +} + +/** Read the snake_case `agent_id` from a stream message (`status` or + * `assistant`). Returns `null` for non-records or absent fields. */ +export function agentIdFromMessage(msg: unknown): string | null { + if (!isRecord(msg)) return null; + return asString(msg.agent_id); +} + +/** Map the raw `wait()` result to the fields the harness consumes. + * Real shape: `{ id, status, result, model, durationMs }`. + * Degrades gracefully: non-records → `{ status: 'error', runId: null, durationMs: null }`. */ +export function outcomeFromResult(raw: unknown): { + status: 'finished' | 'error'; + runId: string | null; + durationMs: number | null; +} { + if (!isRecord(raw)) { + return { status: 'error', runId: null, durationMs: null }; + } + const status: 'finished' | 'error' = raw.status === 'finished' ? 'finished' : 'error'; + const runId = asString(raw.id); + const durationMs = typeof raw.durationMs === 'number' ? raw.durationMs : null; + return { status, runId, durationMs }; +} diff --git a/skills-contrib/drive-judge-harness/test/claude-events.test.ts b/skills-contrib/drive-judge-harness/test/claude-events.test.ts new file mode 100644 index 0000000000..f14a7e41e3 --- /dev/null +++ b/skills-contrib/drive-judge-harness/test/claude-events.test.ts @@ -0,0 +1,176 @@ +import assert from 'node:assert/strict'; +import { describe, it } from 'node:test'; +import { outcomeFromResult, streamEventFromMessage, usageFromAssistant } from '../claude-events.ts'; + +// Real shapes from @anthropic-ai/claude-agent-sdk (confirmed from SDK docs). +// These tests must pass with @anthropic-ai/claude-agent-sdk NOT installed. 
+ +const ASSISTANT_MESSAGE = { + type: 'assistant', + message: { + id: 'msg_01XFDUDYJgAACzvnptvVoYEL', + usage: { + input_tokens: 33, + output_tokens: 904, + cache_creation_input_tokens: 53995, + cache_read_input_tokens: 230827, + }, + }, +}; + +const SUCCESS_RESULT = { + type: 'result', + subtype: 'success', + session_id: 'sess-abc123', + duration_ms: 16025, + num_turns: 9, + total_cost_usd: 0.1839242, + usage: { + input_tokens: 33, + output_tokens: 904, + cache_creation_input_tokens: 53995, + cache_read_input_tokens: 230827, + }, + result: 'done', +}; + +const ERROR_MAX_TURNS_RESULT = { + type: 'result', + subtype: 'error_max_turns', + session_id: 'sess-def456', + duration_ms: 8000, + num_turns: 5, + total_cost_usd: 0.05, + usage: { + input_tokens: 10, + output_tokens: 20, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + }, + result: null, +}; + +describe('usageFromAssistant', () => { + it('maps all four fields from message.usage', () => { + const usage = usageFromAssistant(ASSISTANT_MESSAGE); + assert.ok(usage !== null); + assert.equal(usage.inputTokens, 33); + assert.equal(usage.outputTokens, 904); + assert.equal(usage.cacheWriteTokens, 53995); + assert.equal(usage.cacheReadTokens, 230827); + }); + + it('returns null for a non-assistant type', () => { + assert.equal(usageFromAssistant({ type: 'result', subtype: 'success' }), null); + }); + + it('returns null for an assistant message without usage', () => { + assert.equal(usageFromAssistant({ type: 'assistant', message: { id: 'x' } }), null); + }); + + it('returns null for a non-record', () => { + assert.equal(usageFromAssistant('junk'), null); + assert.equal(usageFromAssistant(null), null); + assert.equal(usageFromAssistant(42), null); + }); +}); + +describe('streamEventFromMessage', () => { + it('maps an assistant message with usage to {kind:turn-ended}', () => { + const event = streamEventFromMessage(ASSISTANT_MESSAGE); + assert.equal(event.kind, 'turn-ended'); + assert.ok(event.kind === 
'turn-ended' && event.usage.inputTokens === 33); + assert.ok(event.kind === 'turn-ended' && event.usage.cacheWriteTokens === 53995); + assert.ok(event.kind === 'turn-ended' && event.usage.cacheReadTokens === 230827); + }); + + it('maps a result message to {kind:other}', () => { + const event = streamEventFromMessage(SUCCESS_RESULT); + assert.equal(event.kind, 'other'); + }); + + it('maps junk to {kind:other}', () => { + assert.equal(streamEventFromMessage({ type: 'unknown' }).kind, 'other'); + assert.equal(streamEventFromMessage(null).kind, 'other'); + }); +}); + +describe('outcomeFromResult', () => { + it('extracts all fields from a success result', () => { + const outcome = outcomeFromResult(SUCCESS_RESULT); + assert.ok(outcome !== null); + assert.equal(outcome.status, 'finished'); + assert.equal(outcome.runId, 'sess-abc123'); + assert.equal(outcome.durationMs, 16025); + assert.equal(outcome.costUsd, 0.1839242); + assert.equal(outcome.numTurns, 9); + }); + + it('maps token fields correctly on a success result', () => { + const outcome = outcomeFromResult(SUCCESS_RESULT); + assert.ok(outcome !== null && outcome.tokens !== null); + assert.equal(outcome.tokens.inputTokens, 33); + assert.equal(outcome.tokens.outputTokens, 904); + assert.equal(outcome.tokens.cacheWriteTokens, 53995); + assert.equal(outcome.tokens.cacheReadTokens, 230827); + assert.equal(outcome.tokens.totalTokens, 33 + 904 + 53995 + 230827); + }); + + it('maps status=error for error_max_turns subtype', () => { + const outcome = outcomeFromResult(ERROR_MAX_TURNS_RESULT); + assert.ok(outcome !== null); + assert.equal(outcome.status, 'error'); + assert.equal(outcome.runId, 'sess-def456'); + assert.equal(outcome.durationMs, 8000); + assert.equal(outcome.costUsd, 0.05); + assert.equal(outcome.numTurns, 5); + }); + + it('maps token fields for error_max_turns result', () => { + const outcome = outcomeFromResult(ERROR_MAX_TURNS_RESULT); + assert.ok(outcome !== null && outcome.tokens !== null); + 
assert.equal(outcome.tokens.inputTokens, 10); + assert.equal(outcome.tokens.outputTokens, 20); + assert.equal(outcome.tokens.cacheWriteTokens, 0); + assert.equal(outcome.tokens.cacheReadTokens, 0); + assert.equal(outcome.tokens.totalTokens, 30); + }); + + it('returns null for a non-result type', () => { + assert.equal(outcomeFromResult(ASSISTANT_MESSAGE), null); + assert.equal(outcomeFromResult({ type: 'status' }), null); + }); + + it('returns null for a non-record', () => { + assert.equal(outcomeFromResult('junk'), null); + assert.equal(outcomeFromResult(null), null); + }); + + it('sets tokens:null when usage is absent', () => { + const noUsage = { ...SUCCESS_RESULT, usage: undefined }; + const outcome = outcomeFromResult(noUsage); + assert.ok(outcome !== null); + assert.equal(outcome.tokens, null); + }); + + it('sets costUsd:null when total_cost_usd is absent', () => { + const { total_cost_usd: _c, ...noUsd } = SUCCESS_RESULT; + const outcome = outcomeFromResult(noUsd); + assert.ok(outcome !== null); + assert.equal(outcome.costUsd, null); + }); + + it('sets durationMs:null when duration_ms is absent', () => { + const { duration_ms: _d, ...noDuration } = SUCCESS_RESULT; + const outcome = outcomeFromResult(noDuration); + assert.ok(outcome !== null); + assert.equal(outcome.durationMs, null); + }); + + it('sets numTurns:null when num_turns is absent', () => { + const { num_turns: _n, ...noTurns } = SUCCESS_RESULT; + const outcome = outcomeFromResult(noTurns); + assert.ok(outcome !== null); + assert.equal(outcome.numTurns, null); + }); +}); diff --git a/skills-contrib/drive-judge-harness/test/collect-run.test.ts b/skills-contrib/drive-judge-harness/test/collect-run.test.ts index f54a0cc01e..dc005efca6 100644 --- a/skills-contrib/drive-judge-harness/test/collect-run.test.ts +++ b/skills-contrib/drive-judge-harness/test/collect-run.test.ts @@ -61,6 +61,7 @@ function fakePrepared(overrides?: Partial): PreparedRun { skillBundleSha: 
'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb', prepareCommit, materialized: true, + preexistingTracePaths: [], ...overrides, }; } @@ -128,6 +129,87 @@ describe('collectRun — trace collection', () => { }); }); +describe('collectRun — preexistingTracePaths exclusion', () => { + it('returns only the run-emitted trace, not the pre-existing baseline trace', () => { + const baselinePath = join(runDir, 'baseline-trace.jsonl'); + const runEmittedPath = join(runDir, 'run-emitted-trace.jsonl'); + + writeFileSync(baselinePath, `${VALID_TRACE_LINE}\n`); + writeFileSync(runEmittedPath, `${VALID_TRACE_LINE}\n`); + + // Simulate: baseline-trace was present before the run started + const result = collectRun(fakePrepared({ preexistingTracePaths: [baselinePath] })); + + assert.equal(result.tracePaths.length, 1, 'only one trace should be returned'); + assert.ok( + result.tracePaths[0]?.endsWith('run-emitted-trace.jsonl'), + 'the returned trace must be the run-emitted one', + ); + assert.ok( + !result.tracePaths.some((p) => p.endsWith('baseline-trace.jsonl')), + 'the baseline-committed trace must not appear in results', + ); + assert.equal(result.untraced, false); + }); + + it('returns no traces when every valid jsonl is listed in preexistingTracePaths', () => { + const baselinePath = join(runDir, 'old-trace.jsonl'); + writeFileSync(baselinePath, `${VALID_TRACE_LINE}\n`); + + const result = collectRun(fakePrepared({ preexistingTracePaths: [baselinePath] })); + + assert.equal(result.tracePaths.length, 0); + assert.equal(result.matchedTrace, null); + assert.equal(result.untraced, true); + }); + + it('agent_id matching runs over the run-emitted set only', () => { + const baselinePath = join(runDir, 'baseline-trace.jsonl'); + const runEmittedPath = join(runDir, 'run-trace.jsonl'); + + // Both are valid traces but with different agent IDs. 
+ const baselineLine = JSON.stringify({ + event_id: 'e1', + schema_version: '1', + ts: '2026-05-31T00:00:00.000Z', + project_run_id: 'proj-base', + orchestrator_agent_id: 'agent-baseline', + event_type: 'dispatch-start', + dispatch_id: 'd1', + dispatch_name: 'baseline', + subagent_type: 'generalPurpose', + model: null, + parent_dispatch_id: null, + }); + const runLine = JSON.stringify({ + event_id: 'e2', + schema_version: '1', + ts: '2026-05-31T00:00:00.000Z', + project_run_id: 'proj-run', + orchestrator_agent_id: 'agent-run', + event_type: 'dispatch-start', + dispatch_id: 'd2', + dispatch_name: 'run', + subagent_type: 'generalPurpose', + model: null, + parent_dispatch_id: null, + }); + + writeFileSync(baselinePath, `${baselineLine}\n`); + writeFileSync(runEmittedPath, `${runLine}\n`); + + const result = collectRun(fakePrepared({ preexistingTracePaths: [baselinePath] }), { + agentId: 'agent-baseline', + }); + + // 'agent-baseline' is only in the preexisting trace; the run-emitted set has + // only 'agent-run'. The exclusion must happen before matching. 
+ assert.equal(result.tracePaths.length, 1); + assert.ok(result.tracePaths[0]?.endsWith('run-trace.jsonl')); + assert.ok(result.matchedTrace?.endsWith('run-trace.jsonl')); + }); +}); + describe('collectRun — diff excludes injected skill files (baseline-commit cut point)', () => { it('diff against prepareCommit omits skill bundle files committed at baseline', () => { // Agent changes: only a source file — not the skill files diff --git a/skills-contrib/drive-judge-harness/test/manifest.test.ts b/skills-contrib/drive-judge-harness/test/manifest.test.ts index 05a855571d..f4e98c1201 100644 --- a/skills-contrib/drive-judge-harness/test/manifest.test.ts +++ b/skills-contrib/drive-judge-harness/test/manifest.test.ts @@ -19,11 +19,15 @@ const dryRunManifest: RunManifest = { schema_version: '1', case_slug: 'slice-dedupe-generated-imports', model: 'claude-4.6-sonnet-high-thinking', + runtime: 'claude', status: 'dry-run', run_id: null, agent_id: null, trace_file: 'projects/x/trace.jsonl', tokens: null, + wall_clock_ms: null, + cost_usd: null, + num_turns: null, started_at: '2026-05-30T00:00:00.000Z', finished_at: null, notes: ['dry-run: live execution gate not satisfied'], @@ -59,6 +63,7 @@ describe('writeManifest', () => { status: 'finished', run_id: 'run-1', agent_id: 'agent-1', + wall_clock_ms: 5000, finished_at: '2026-05-30T00:10:00.000Z', tokens: { inputTokens: 100, diff --git a/skills-contrib/drive-judge-harness/test/prepare-run.test.ts b/skills-contrib/drive-judge-harness/test/prepare-run.test.ts index d892b111fc..24ed7963b4 100644 --- a/skills-contrib/drive-judge-harness/test/prepare-run.test.ts +++ b/skills-contrib/drive-judge-harness/test/prepare-run.test.ts @@ -162,6 +162,37 @@ describe('prepareRun', () => { const prepared = prepareRun(config, { materialize: mockMaterialize }); assert.equal(prepared.prepareCommit.length, 40); }); + + it('preexistingTracePaths is empty when the base checkout has no .jsonl files', () => { + const config: PrepareRunConfig = { + 
repoUnderTestDir: repoDir, + baseRef, + skillBundle: { repoDir, ref: bundleRef }, + runDir, + }; + const prepared = prepareRun(config, { materialize: mockMaterialize }); + assert.ok(Array.isArray(prepared.preexistingTracePaths)); + assert.equal(prepared.preexistingTracePaths.length, 0); + }); + + it('preexistingTracePaths lists committed .jsonl files present at baseline', () => { + // Add a .jsonl to the base checkout so it's in the worktree after prepare-run + mkdirSync(join(repoDir, 'wip', 'drive-trace'), { recursive: true }); + writeFileSync(join(repoDir, 'wip', 'drive-trace', 'old-trace.jsonl'), '{"event_id":"e0"}\n'); + gitIn(repoDir, 'add', '-A'); + gitIn(repoDir, 'commit', '-m', 'add old trace'); + const baseRefWithTrace = gitIn(repoDir, 'rev-parse', 'HEAD'); + + const config: PrepareRunConfig = { + repoUnderTestDir: repoDir, + baseRef: baseRefWithTrace, + skillBundle: { repoDir, ref: bundleRef }, + runDir, + }; + const prepared = prepareRun(config, { materialize: mockMaterialize }); + assert.equal(prepared.preexistingTracePaths.length, 1); + assert.ok(prepared.preexistingTracePaths[0]?.endsWith('old-trace.jsonl')); + }); }); describe('prepareRun + collectRun — empty-overlay cut point', () => { diff --git a/skills-contrib/drive-judge-harness/test/run-arm.test.ts b/skills-contrib/drive-judge-harness/test/run-arm.test.ts index 946b82fc8c..00ba5a3889 100644 --- a/skills-contrib/drive-judge-harness/test/run-arm.test.ts +++ b/skills-contrib/drive-judge-harness/test/run-arm.test.ts @@ -66,7 +66,15 @@ function mockRun(): OrchestratorRun { return { async *stream() {}, async wait() { - return { status: 'finished', runId: 'run-1', agentId: 'agent-1' }; + return { + status: 'finished' as const, + runId: 'run-1', + agentId: 'agent-1', + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, + }; }, }; } @@ -85,6 +93,7 @@ function baseConfig(): RunArmConfig { manifestFile: join(tmpDir, 'run.json'), live: true, apiKeyPresent: true, + runtime: 'claude', }; } 
diff --git a/skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts b/skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts index 0037f562a2..aa5e6b066f 100644 --- a/skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts +++ b/skills-contrib/drive-judge-harness/test/run-one-brief-cwd.test.ts @@ -26,7 +26,15 @@ function mockRun(): OrchestratorRun { return { async *stream() {}, async wait() { - return { status: 'finished', runId: null, agentId: null }; + return { + status: 'finished' as const, + runId: null, + agentId: null, + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, + }; }, }; } @@ -48,6 +56,7 @@ describe('runOneBrief — cwd thread-through', () => { runDir, live: true, apiKeyPresent: true, + runtime: 'claude', }, { createAgent }, ); @@ -71,6 +80,7 @@ describe('runOneBrief — cwd thread-through', () => { runDir, live: false, apiKeyPresent: true, + runtime: 'claude', }, { createAgent }, ); diff --git a/skills-contrib/drive-judge-harness/test/run-one-brief.test.ts b/skills-contrib/drive-judge-harness/test/run-one-brief.test.ts index 8c9a4bfbae..235cde360a 100644 --- a/skills-contrib/drive-judge-harness/test/run-one-brief.test.ts +++ b/skills-contrib/drive-judge-harness/test/run-one-brief.test.ts @@ -9,9 +9,11 @@ import { assemblePrompt, type CreateAgent, type OrchestratorRun, + type RunOutcome, type RunStreamEvent, runOneBrief, } from '../run-one-brief.ts'; +import type { TokenTotals } from '../usage.ts'; const GOLDEN_DIR = fileURLToPath( new URL('../../../projects/drive-judge-harness/assets/golden/', import.meta.url), @@ -28,11 +30,18 @@ afterEach(() => { const FIXED_NOW = () => '2026-05-30T12:00:00.000Z'; +const NULL_OUTCOME: RunOutcome = { + status: 'finished', + runId: null, + agentId: null, + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, +}; + /** A mock orchestrator run that yields synthetic stream events — no network. 
*/ -function mockRun( - events: RunStreamEvent[], - outcome: Awaited>, -): OrchestratorRun { +function mockRun(events: RunStreamEvent[], outcome: RunOutcome): OrchestratorRun { return { async *stream() { for (const e of events) yield e; @@ -48,7 +57,7 @@ describe('runOneBrief — dry-run gate', () => { let called = false; const createAgent: CreateAgent = async () => { called = true; - return mockRun([], { status: 'finished', runId: null, agentId: null }); + return mockRun([], NULL_OUTCOME); }; const result = await runOneBrief( { @@ -59,6 +68,7 @@ describe('runOneBrief — dry-run gate', () => { runDir: dir, live: false, apiKeyPresent: true, + runtime: 'claude', }, { createAgent, now: FIXED_NOW }, ); @@ -66,13 +76,14 @@ describe('runOneBrief — dry-run gate', () => { assert.equal(result.createAgentCalled, false); assert.equal(result.status, 'dry-run'); assert.equal(result.manifest.tokens, null); + assert.equal(result.manifest.runtime, 'claude'); }); - it('does not call createAgent when live is true but no API key', async () => { + it('does not call createAgent when live is true but no API key (cursor runtime)', async () => { let called = false; const createAgent: CreateAgent = async () => { called = true; - return mockRun([], { status: 'finished', runId: null, agentId: null }); + return mockRun([], NULL_OUTCOME); }; const result = await runOneBrief( { @@ -83,6 +94,7 @@ describe('runOneBrief — dry-run gate', () => { runDir: dir, live: true, apiKeyPresent: false, + runtime: 'cursor', }, { createAgent, now: FIXED_NOW }, ); @@ -91,6 +103,30 @@ describe('runOneBrief — dry-run gate', () => { assert.match(result.manifest.notes.join(' '), /CURSOR_API_KEY is absent/); }); + it('does not call createAgent when live is true but no API key (claude runtime)', async () => { + let called = false; + const createAgent: CreateAgent = async () => { + called = true; + return mockRun([], NULL_OUTCOME); + }; + const result = await runOneBrief( + { + caseDir: CASE_DIR, + traceFile: join(dir, 
'trace.jsonl'), + manifestFile: join(dir, 'run.json'), + model: 'pinned-model', + runDir: dir, + live: true, + apiKeyPresent: false, + runtime: 'claude', + }, + { createAgent, now: FIXED_NOW }, + ); + assert.equal(called, false); + assert.equal(result.status, 'dry-run'); + assert.match(result.manifest.notes.join(' '), /ANTHROPIC_API_KEY is absent/); + }); + it('writes a dry-run manifest to disk', async () => { const manifestFile = join(dir, 'run.json'); await runOneBrief( @@ -102,6 +138,7 @@ describe('runOneBrief — dry-run gate', () => { runDir: dir, live: false, apiKeyPresent: false, + runtime: 'claude', }, { now: FIXED_NOW }, ); @@ -110,6 +147,9 @@ describe('runOneBrief — dry-run gate', () => { assert.equal(parsed.case_slug, 'slice-dedupe-generated-imports'); assert.equal(parsed.model, 'pinned-model'); assert.equal(parsed.tokens, null); + assert.equal(parsed.runtime, 'claude'); + assert.equal(parsed.cost_usd, null); + assert.equal(parsed.num_turns, null); }); }); @@ -125,7 +165,15 @@ describe('runOneBrief — live path with mock SDK', () => { { kind: 'turn-ended', usage: { inputTokens: 50, outputTokens: 20 } }, ]; const createAgent: CreateAgent = async () => - mockRun(events, { status: 'finished', runId: 'run-42', agentId: 'agent-42' }); + mockRun(events, { + status: 'finished', + runId: 'run-42', + agentId: 'agent-42', + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, + }); const manifestFile = join(dir, 'run.json'); const result = await runOneBrief( @@ -137,6 +185,7 @@ describe('runOneBrief — live path with mock SDK', () => { runDir: dir, live: true, apiKeyPresent: true, + runtime: 'cursor', }, { createAgent, now: FIXED_NOW }, ); @@ -148,6 +197,7 @@ describe('runOneBrief — live path with mock SDK', () => { assert.equal(result.manifest.tokens?.outputTokens, 60); assert.equal(result.manifest.tokens?.totalTokens, 225); assert.equal(result.manifest.run_id, 'run-42'); + assert.equal(result.manifest.runtime, 'cursor'); const parsed = 
JSON.parse(readFileSync(manifestFile, 'utf8')); assert.equal(parsed.tokens.totalTokens, 225); @@ -166,6 +216,7 @@ describe('runOneBrief — live path with mock SDK', () => { runDir: dir, live: true, apiKeyPresent: true, + runtime: 'claude', }, { createAgent, now: FIXED_NOW }, ); @@ -179,7 +230,15 @@ describe('runOneBrief — live path with mock SDK', () => { { kind: 'turn-ended', usage: { inputTokens: 10, outputTokens: 2 } }, ]; const createAgent: CreateAgent = async () => - mockRun(events, { status: 'error', runId: 'run-err', agentId: null }); + mockRun(events, { + status: 'error', + runId: 'run-err', + agentId: null, + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, + }); const result = await runOneBrief( { caseDir: CASE_DIR, @@ -189,6 +248,7 @@ describe('runOneBrief — live path with mock SDK', () => { runDir: dir, live: true, apiKeyPresent: true, + runtime: 'claude', }, { createAgent, now: FIXED_NOW }, ); @@ -203,7 +263,15 @@ describe('runOneBrief — live path with mock SDK', () => { throw new Error('stream died'); }, async wait() { - return { status: 'finished', runId: 'unreached', agentId: null }; + return { + status: 'finished' as const, + runId: 'unreached', + agentId: null, + durationMs: null, + tokens: null, + costUsd: null, + numTurns: null, + }; }, }); const manifestFile = join(dir, 'run.json'); @@ -216,6 +284,7 @@ describe('runOneBrief — live path with mock SDK', () => { runDir: dir, live: true, apiKeyPresent: true, + runtime: 'claude', }, { createAgent, now: FIXED_NOW }, ); @@ -226,6 +295,118 @@ describe('runOneBrief — live path with mock SDK', () => { const parsed = JSON.parse(readFileSync(manifestFile, 'utf8')); assert.equal(parsed.status, 'error'); }); + + it('captures agent_id and wall_clock_ms from the outcome, and notes null tokens', async () => { + const createAgent: CreateAgent = async () => + mockRun([], { + status: 'finished', + runId: 'run-live-1', + agentId: 'agent-live-1', + durationMs: 87654, + tokens: null, + costUsd: 
null, + numTurns: null, + }); + + const result = await runOneBrief( + { + caseDir: CASE_DIR, + traceFile: join(dir, 'trace.jsonl'), + manifestFile: join(dir, 'run.json'), + model: 'pinned-model', + runDir: dir, + live: true, + apiKeyPresent: true, + runtime: 'cursor', + }, + { createAgent, now: FIXED_NOW }, + ); + + assert.equal(result.status, 'finished'); + assert.equal(result.manifest.agent_id, 'agent-live-1'); + assert.equal(result.manifest.wall_clock_ms, 87654); + assert.equal(result.manifest.tokens, null); + assert.ok( + result.manifest.notes.some((n) => + n.includes('tokens unavailable: @cursor/sdk local runtime emits no usage events'), + ), + ); + }); + + it('prefers outcome.tokens over per-turn accumulation when the runtime provides them', async () => { + const runtimeTokens: TokenTotals = { + inputTokens: 33, + outputTokens: 904, + cacheReadTokens: 230827, + cacheWriteTokens: 53995, + totalTokens: 285759, + }; + // Also emit a per-turn event with different values to confirm outcome wins. 
+ const events: RunStreamEvent[] = [ + { kind: 'turn-ended', usage: { inputTokens: 1, outputTokens: 1 } }, + ]; + const createAgent: CreateAgent = async () => + mockRun(events, { + status: 'finished', + runId: 'sess-abc', + agentId: null, + durationMs: 16025, + tokens: runtimeTokens, + costUsd: 0.1839242, + numTurns: 9, + }); + + const manifestFile = join(dir, 'run.json'); + const result = await runOneBrief( + { + caseDir: CASE_DIR, + traceFile: join(dir, 'trace.jsonl'), + manifestFile, + model: 'pinned-model', + runDir: dir, + live: true, + apiKeyPresent: true, + runtime: 'claude', + }, + { createAgent, now: FIXED_NOW }, + ); + + assert.equal(result.status, 'finished'); + assert.equal(result.manifest.runtime, 'claude'); + // outcome.tokens takes priority over accumulated per-turn totals + assert.equal(result.manifest.tokens?.inputTokens, 33); + assert.equal(result.manifest.tokens?.outputTokens, 904); + assert.equal(result.manifest.tokens?.totalTokens, 285759); + assert.equal(result.manifest.cost_usd, 0.1839242); + assert.equal(result.manifest.num_turns, 9); + assert.equal(result.manifest.wall_clock_ms, 16025); + assert.equal(result.manifest.notes.length, 0, 'no notes when tokens are present'); + + const parsed = JSON.parse(readFileSync(manifestFile, 'utf8')); + assert.equal(parsed.runtime, 'claude'); + assert.equal(parsed.tokens.totalTokens, 285759); + assert.equal(parsed.cost_usd, 0.1839242); + assert.equal(parsed.num_turns, 9); + assert.equal(parsed.wall_clock_ms, 16025); + }); + + it('runtime:cursor produces runtime:cursor in the manifest', async () => { + const createAgent: CreateAgent = async () => mockRun([], NULL_OUTCOME); + const result = await runOneBrief( + { + caseDir: CASE_DIR, + traceFile: join(dir, 'trace.jsonl'), + manifestFile: join(dir, 'run.json'), + model: 'pinned-model', + runDir: dir, + live: true, + apiKeyPresent: true, + runtime: 'cursor', + }, + { createAgent, now: FIXED_NOW }, + ); + assert.equal(result.manifest.runtime, 'cursor'); + }); 
}); describe('assemblePrompt', () => { diff --git a/skills-contrib/drive-judge-harness/test/sdk-events.test.ts b/skills-contrib/drive-judge-harness/test/sdk-events.test.ts new file mode 100644 index 0000000000..4add9598d6 --- /dev/null +++ b/skills-contrib/drive-judge-harness/test/sdk-events.test.ts @@ -0,0 +1,147 @@ +import assert from 'node:assert/strict'; +import { describe, it } from 'node:test'; +import { + agentIdFromMessage, + isRecord, + outcomeFromResult, + streamEventFromMessage, +} from '../sdk-events.ts'; + +// Real shapes from @cursor/sdk@1.0.15 local runtime (captured via probe). +// Tests here must pass with @cursor/sdk NOT installed. + +const STATUS_MESSAGE = { + type: 'status', + agent_id: 'agent-abc123', + run_id: 'run-xyz789', + status: 'running', +}; + +const ASSISTANT_MESSAGE = { + type: 'assistant', + agent_id: 'agent-abc123', + run_id: 'run-xyz789', + message: { + content: [{ type: 'text', text: 'Hello from the orchestrator.' }], + }, +}; + +const WAIT_OUTCOME = { + id: 'run-xyz789', + status: 'finished', + result: 'done', + model: 'composer-2.5-fast', + durationMs: 42500, +}; + +describe('agentIdFromMessage', () => { + it('reads agent_id from a status message', () => { + assert.equal(agentIdFromMessage(STATUS_MESSAGE), 'agent-abc123'); + }); + + it('reads agent_id from an assistant message', () => { + assert.equal(agentIdFromMessage(ASSISTANT_MESSAGE), 'agent-abc123'); + }); + + it('returns null for the wait() outcome (no agent_id field)', () => { + assert.equal(agentIdFromMessage(WAIT_OUTCOME), null); + }); + + it('returns null for a non-object (string)', () => { + assert.equal(agentIdFromMessage('junk'), null); + }); + + it('returns null for a non-object (null)', () => { + assert.equal(agentIdFromMessage(null), null); + }); + + it('returns null for a record with no agent_id', () => { + assert.equal(agentIdFromMessage({ type: 'other' }), null); + }); +}); + +describe('outcomeFromResult', () => { + it('extracts runId, status=finished, and 
durationMs from the real outcome shape', () => { + const result = outcomeFromResult(WAIT_OUTCOME); + assert.equal(result.status, 'finished'); + assert.equal(result.runId, 'run-xyz789'); + assert.equal(result.durationMs, 42500); + }); + + it('maps status=error for a non-finished status', () => { + const result = outcomeFromResult({ ...WAIT_OUTCOME, status: 'failed' }); + assert.equal(result.status, 'error'); + }); + + it('returns durationMs:null when durationMs is absent', () => { + const { durationMs: _d, ...withoutDuration } = WAIT_OUTCOME; + const result = outcomeFromResult(withoutDuration); + assert.equal(result.durationMs, null); + }); + + it('returns durationMs:null when durationMs is not a number', () => { + const result = outcomeFromResult({ ...WAIT_OUTCOME, durationMs: 'not-a-number' }); + assert.equal(result.durationMs, null); + }); + + it('degrades to {status:error, runId:null, durationMs:null} for a non-record', () => { + const result = outcomeFromResult('not-an-object'); + assert.equal(result.status, 'error'); + assert.equal(result.runId, null); + assert.equal(result.durationMs, null); + }); + + it('degrades to {status:error, runId:null, durationMs:null} for null', () => { + const result = outcomeFromResult(null); + assert.equal(result.status, 'error'); + assert.equal(result.runId, null); + assert.equal(result.durationMs, null); + }); +}); + +describe('streamEventFromMessage', () => { + it('maps a status message to {kind:other} (no usage, no assistant text)', () => { + const event = streamEventFromMessage(STATUS_MESSAGE); + assert.equal(event.kind, 'other'); + }); + + it('maps an assistant message with text content to {kind:text}', () => { + const event = streamEventFromMessage(ASSISTANT_MESSAGE); + assert.equal(event.kind, 'text'); + assert.ok(event.kind === 'text' && event.text.includes('Hello from the orchestrator.')); + }); + + it('maps a turn-ended message with usage to {kind:turn-ended}', () => { + const turnEndedMsg = { + usage: { inputTokens: 
100, outputTokens: 40, cacheReadTokens: 0, cacheWriteTokens: 0 }, + }; + const event = streamEventFromMessage(turnEndedMsg); + assert.equal(event.kind, 'turn-ended'); + assert.ok(event.kind === 'turn-ended' && event.usage.inputTokens === 100); + }); + + it('maps junk to {kind:other}', () => { + const event = streamEventFromMessage({ type: 'unknown-event' }); + assert.equal(event.kind, 'other'); + }); +}); + +describe('isRecord', () => { + it('returns true for plain objects', () => { + assert.equal(isRecord({}), true); + assert.equal(isRecord({ a: 1 }), true); + }); + + it('returns false for arrays', () => { + assert.equal(isRecord([]), false); + }); + + it('returns false for null', () => { + assert.equal(isRecord(null), false); + }); + + it('returns false for primitives', () => { + assert.equal(isRecord('string'), false); + assert.equal(isRecord(42), false); + }); +}); diff --git a/skills-contrib/drive-judge-harness/trace-files.ts b/skills-contrib/drive-judge-harness/trace-files.ts new file mode 100644 index 0000000000..efc7cf15b2 --- /dev/null +++ b/skills-contrib/drive-judge-harness/trace-files.ts @@ -0,0 +1,23 @@ +import { type Dirent, readdirSync } from 'node:fs'; +import { join } from 'pathe'; + +/** Recursively collect all `.jsonl` file paths under `dir`. + * Returns an empty array when `dir` does not exist or cannot be read. */ +export function findJsonlFiles(dir: string): string[] { + const results: string[] = []; + let entries: Dirent[]; + try { + entries = readdirSync(dir, { withFileTypes: true }); + } catch { + return results; + } + for (const entry of entries) { + const fullPath = join(dir, entry.name); + if (entry.isDirectory()) { + results.push(...findJsonlFiles(fullPath)); + } else if (entry.isFile() && entry.name.endsWith('.jsonl')) { + results.push(fullPath); + } + } + return results; +}