braintrustdata · aswink · Mar 12, 2026 · Mar 12, 2026 · Mar 16, 2026 · Mar 17, 2026
diff --git a/js/dev/server.ts b/js/dev/server.ts
@@ -117,7 +117,7 @@ export function runDevServer(
 
         evalDefs[name] = {
           parameters,
-          scores: evaluator.scores.map((score, idx) => ({
+          scores: (evaluator.scores ?? []).map((score, idx) => ({
             name: scorerName(score, idx),
           })),
         };
@@ -209,7 +209,7 @@ export function runDevServer(
           {
             ...evaluator,
             data: evalData.data,
-            scores: evaluator.scores.concat(
+            scores: (evaluator.scores ?? []).concat(
               scores?.map((score) =>
                 makeScorer(
                   state,

diff --git a/js/src/cli/functions/infer-source.ts b/js/src/cli/functions/infer-source.ts
@@ -85,7 +85,7 @@ export async function findCodeDefinition({
       fn =
         location.position.type === "task"
           ? evaluator.task
-          : evaluator.scores[location.position.index];
+          : (evaluator.scores ?? [])[location.position.index];
     }
   } else if (location.type === "function") {
     fn = outFileModule.functions[location.index].handler;

diff --git a/js/src/cli/functions/upload.ts b/js/src/cli/functions/upload.ts
@@ -180,23 +180,25 @@ export async function uploadHandleBundles({
           function_type: "task",
           origin,
         },
-        ...evaluator.evaluator.scores.map((score, i): BundledFunctionSpec => {
-          const name = scorerName(score, i);
-          return {
-            ...baseInfo,
-            // There is a very small chance that someone names a function with the same convention, but
-            // let's assume it's low enough that it doesn't matter.
-            ...formatNameAndSlug(["eval", namePrefix, "scorer", name]),
-            description: `Score ${name} for eval ${namePrefix}`,
-            location: {
-              type: "experiment",
-              eval_name: evaluator.evaluator.evalName,
-              position: { type: "scorer", index: i },
-            },
-            function_type: "scorer",
-            origin,
-          };
-        }),
+        ...(evaluator.evaluator.scores ?? []).map(
+          (score, i): BundledFunctionSpec => {
+            const name = scorerName(score, i);
+            return {
+              ...baseInfo,
+              // There is a very small chance that someone names a function with the same convention, but
+              // let's assume it's low enough that it doesn't matter.
+              ...formatNameAndSlug(["eval", namePrefix, "scorer", name]),
+              description: `Score ${name} for eval ${namePrefix}`,
+              location: {
+                type: "experiment",
+                eval_name: evaluator.evaluator.evalName,
+                position: { type: "scorer", index: i },
+              },
+              function_type: "scorer",
+              origin,
+            };
+          },
+        ),
       ];
 
       bundleSpecs.push(...fileSpecs);
@@ -219,7 +221,7 @@ export async function uploadHandleBundles({
                   serializeRemoteEvalParametersContainer(resolvedParameters),
               }
             : {}),
-          scores: evaluator.evaluator.scores.map((score, i) => ({
+          scores: (evaluator.evaluator.scores ?? []).map((score, i) => ({
             name: scorerName(score, i),
           })),
         };

diff --git a/js/src/exports.ts b/js/src/exports.ts
@@ -185,6 +185,7 @@ export type {
   EvalResult,
   EvalScorerArgs,
   EvalScorer,
+  EvalClassifier,
   EvaluatorDef,
   EvaluatorFile,
   ReporterBody,

diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts
@@ -179,7 +179,6 @@ describe("runEvaluator", () => {
         new NoopProgressReporter(),
         [],
         undefined,
-        true,
       );
 
       expect(out.results.every((r) => Object.keys(r.scores).length === 0)).toBe(
@@ -207,7 +206,6 @@ describe("runEvaluator", () => {
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -237,7 +235,6 @@ describe("runEvaluator", () => {
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -271,7 +268,6 @@ describe("runEvaluator", () => {
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -297,7 +293,6 @@ describe("runEvaluator", () => {
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -477,7 +472,7 @@ test("trialIndex is passed to task", async () => {
   // All results should be correct
   results.forEach((result) => {
     expect(result.input).toBe(1);
-    expect(result.expected).toBe(2);
+    expect("expected" in result ? result.expected : undefined).toBe(2);
     expect(result.output).toBe(2);
     expect(result.error).toBeUndefined();
   });
@@ -575,9 +570,8 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn
 
 test("Eval with returnResults: false produces empty results but valid summary", async () => {
   const result = await Eval(
-    "test-no-results",
+    "test-no-results-project",
     {
-      projectName: "test-no-results-project",
       data: [
         { input: "hello", expected: "hello world" },
         { input: "test", expected: "test world" },
@@ -615,9 +609,8 @@ test("Eval with returnResults: false produces empty results but valid summary",
 
 test("Eval with returnResults: true collects all results", async () => {
   const result = await Eval(
-    "test-with-results",
+    "test-with-results-project",
     {
-      projectName: "test-with-results-project",
       data: [
         { input: "hello", expected: "hello world" },
         { input: "test", expected: "test world" },
@@ -668,7 +661,7 @@ test("tags can be appended and logged to root span", async () => {
       evalName: "js-tags-append",
       data: [{ input: "hello", expected: "hello world", tags: initialTags }],
       task: (input, hooks) => {
-        for (const t of appendedTags) hooks.tags.push(t);
+        for (const t of appendedTags) hooks.tags!.push(t);
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
@@ -825,7 +818,7 @@ test("scorer spans have purpose='scorer' attribute", async () => {
       data: [{ input: "hello", expected: "hello" }],
       task: async (input: string) => input,
       scores: [
-        (args: { input: string; output: string; expected: string }) => ({
+        (args: { output: string; expected?: string }) => ({
           name: "simple_scorer",
           score: args.output === args.expected ? 1 : 0,
         }),
@@ -972,11 +965,12 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
         {
           name: "test-prompt",
           slug: "test-prompt",
           metadata,
-        },
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        } as any,
       );
 
       const mockProjectMap = {
@@ -1001,10 +995,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt" } as any,
       );
 
       const mockProjectMap = {
@@ -1027,11 +1019,12 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
         {
           name: "test-prompt",
           slug: "test-prompt",
           environments: ["production"],
-        },
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        } as any,
       );
 
       const mockProjectMap = {
@@ -1054,11 +1047,12 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
         {
           name: "test-prompt",
           slug: "test-prompt",
           environments: ["staging", "production"],
-        },
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        } as any,
       );
 
       const mockProjectMap = {
@@ -1084,10 +1078,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt" } as any,
       );
 
       const mockProjectMap = {
@@ -1130,11 +1122,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-          tags,
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt", tags } as any,
       );
 
       const mockProjectMap = {
@@ -1159,10 +1148,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt" } as any,
       );
 
       const mockProjectMap = {
@@ -1504,3 +1491,135 @@ test("Eval with enableCache: true (default) uses span cache", async () => {
   expect(startSpy).toHaveBeenCalled();
   expect(stopSpy).toHaveBeenCalled();
 });
+
+// A classifier result's `name` becomes the key under `classifications`; the
+// remaining fields (id, label, metadata) form the entry stored in that list.
+test("classifier-only evaluator populates classifications field", async () => {
+  const result = await Eval(
+    "test-classifier-only",
+    {
+      data: [{ input: "hello", expected: "greeting" }],
+      task: (input) => input,
+      classifiers: [
+        () => ({
+          name: "category",
+          id: "greeting",
+          label: "Greeting",
+          metadata: { source: "unit-test" },
+        }),
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  const r = result.results[0];
+  expect(r.classifications?.category).toEqual([
+    {
+      id: "greeting",
+      label: "Greeting",
+      metadata: { source: "unit-test" },
+    },
+  ]);
+});
+
+// With only `scores` configured, `classifications` is left undefined.
+test("scorer-only evaluator populates scores field", async () => {
+  const result = await Eval(
+    "test-scorer-only",
+    {
+      data: [{ input: "hello", expected: "hello" }],
+      task: (input) => input,
+      scores: [
+        (args) => ({
+          name: "exact_match",
+          score: args.output === args.expected ? 1 : 0,
+        }),
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect(result.results[0].scores.exact_match).toBe(1);
+  expect(result.results[0].classifications).toBeUndefined();
+});
+
+// A single classifier may return an array; items that share a `name` end up
+// in the same list, in the order they were returned.
+test("classifier returning multiple items with the same name appends them in order", async () => {
+  const result = await Eval(
+    "test-classifier-append",
+    {
+      data: [{ input: "hello" }],
+      task: (input) => input,
+      classifiers: [
+        () => [
+          { name: "category", id: "greeting", label: "Greeting" },
+          { name: "category", id: "informal", label: "Informal" },
+        ],
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect(result.results[0].classifications?.category).toHaveLength(2);
+  expect(result.results[0].classifications?.category[0]).toEqual({
+    id: "greeting",
+    label: "Greeting",
+  });
+  expect(result.results[0].classifications?.category[1]).toEqual({
+    id: "informal",
+    label: "Informal",
+  });
+});
+
+// Scorers and classifiers can be combined; each populates its own field.
+test("mixed evaluator populates both scores and classifications", async () => {
+  const result = await Eval(
+    "test-score-and-classify",
+    {
+      data: [{ input: "hello", expected: "hello" }],
+      task: (input) => input,
+      scores: [
+        (args) => ({
+          name: "exact_match",
+          score: args.output === args.expected ? 1 : 0,
+        }),
+      ],
+      classifiers: [
+        () => ({ name: "category", id: "greeting", label: "Greeting" }),
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect(result.results[0].scores.exact_match).toBe(1);
+  expect(result.results[0].classifications?.category).toEqual([
+    { id: "greeting", label: "Greeting" },
+  ]);
+});
+
+// A classifier result without a `name` does not abort the eval: a result row
+// is still produced and the failure is expected in `metadata.classifier_errors`.
+test("malformed classifier output fails clearly", async () => {
+  const result = await Eval(
+    "test-invalid-classifier-output",
+    {
+      data: [{ input: "hello" }],
+      task: (input) => input,
+      classifiers: [() => ({}) as never],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  expect((result.results[0] as any).metadata?.classifier_errors).toMatchObject({
+    classifier_0: expect.stringMatching(
+      /must return classifications with a non-empty string name/,
+    ),
+  });
+});