Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions js/dev/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ export function runDevServer(

evalDefs[name] = {
parameters,
scores: evaluator.scores.map((score, idx) => ({
scores: (evaluator.scores ?? []).map((score, idx) => ({
name: scorerName(score, idx),
})),
};
Expand Down Expand Up @@ -209,7 +209,7 @@ export function runDevServer(
{
...evaluator,
data: evalData.data,
scores: evaluator.scores.concat(
scores: (evaluator.scores ?? []).concat(
scores?.map((score) =>
makeScorer(
state,
Expand Down
2 changes: 1 addition & 1 deletion js/src/cli/functions/infer-source.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export async function findCodeDefinition({
fn =
location.position.type === "task"
? evaluator.task
: evaluator.scores[location.position.index];
: (evaluator.scores ?? [])[location.position.index];
}
} else if (location.type === "function") {
fn = outFileModule.functions[location.index].handler;
Expand Down
38 changes: 20 additions & 18 deletions js/src/cli/functions/upload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,23 +180,25 @@ export async function uploadHandleBundles({
function_type: "task",
origin,
},
...evaluator.evaluator.scores.map((score, i): BundledFunctionSpec => {
const name = scorerName(score, i);
return {
...baseInfo,
// There is a very small chance that someone names a function with the same convention, but
// let's assume it's low enough that it doesn't matter.
...formatNameAndSlug(["eval", namePrefix, "scorer", name]),
description: `Score ${name} for eval ${namePrefix}`,
location: {
type: "experiment",
eval_name: evaluator.evaluator.evalName,
position: { type: "scorer", index: i },
},
function_type: "scorer",
origin,
};
}),
...(evaluator.evaluator.scores ?? []).map(
(score, i): BundledFunctionSpec => {
const name = scorerName(score, i);
return {
...baseInfo,
// There is a very small chance that someone names a function with the same convention, but
// let's assume it's low enough that it doesn't matter.
...formatNameAndSlug(["eval", namePrefix, "scorer", name]),
description: `Score ${name} for eval ${namePrefix}`,
location: {
type: "experiment",
eval_name: evaluator.evaluator.evalName,
position: { type: "scorer", index: i },
},
function_type: "scorer",
origin,
};
},
),
];

bundleSpecs.push(...fileSpecs);
Expand All @@ -219,7 +221,7 @@ export async function uploadHandleBundles({
serializeRemoteEvalParametersContainer(resolvedParameters),
}
: {}),
scores: evaluator.evaluator.scores.map((score, i) => ({
scores: (evaluator.evaluator.scores ?? []).map((score, i) => ({
name: scorerName(score, i),
})),
};
Expand Down
1 change: 1 addition & 0 deletions js/src/exports.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ export type {
EvalResult,
EvalScorerArgs,
EvalScorer,
EvalClassifier,
EvaluatorDef,
EvaluatorFile,
ReporterBody,
Expand Down
174 changes: 142 additions & 32 deletions js/src/framework.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ describe("runEvaluator", () => {
new NoopProgressReporter(),
[],
undefined,
true,
);

expect(out.results.every((r) => Object.keys(r.scores).length === 0)).toBe(
Expand Down Expand Up @@ -207,7 +206,6 @@ describe("runEvaluator", () => {
new NoopProgressReporter(),
[],
undefined,
true,
);

expect(
Expand Down Expand Up @@ -237,7 +235,6 @@ describe("runEvaluator", () => {
new NoopProgressReporter(),
[],
undefined,
true,
);

expect(
Expand Down Expand Up @@ -271,7 +268,6 @@ describe("runEvaluator", () => {
new NoopProgressReporter(),
[],
undefined,
true,
);

expect(
Expand All @@ -297,7 +293,6 @@ describe("runEvaluator", () => {
new NoopProgressReporter(),
[],
undefined,
true,
);

expect(
Expand Down Expand Up @@ -477,7 +472,7 @@ test("trialIndex is passed to task", async () => {
// All results should be correct
results.forEach((result) => {
expect(result.input).toBe(1);
expect(result.expected).toBe(2);
expect("expected" in result ? result.expected : undefined).toBe(2);
expect(result.output).toBe(2);
expect(result.error).toBeUndefined();
});
Expand Down Expand Up @@ -575,9 +570,8 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn

test("Eval with returnResults: false produces empty results but valid summary", async () => {
const result = await Eval(
"test-no-results",
"test-no-results-project",
{
projectName: "test-no-results-project",
data: [
{ input: "hello", expected: "hello world" },
{ input: "test", expected: "test world" },
Expand Down Expand Up @@ -615,9 +609,8 @@ test("Eval with returnResults: false produces empty results but valid summary",

test("Eval with returnResults: true collects all results", async () => {
const result = await Eval(
"test-with-results",
"test-with-results-project",
{
projectName: "test-with-results-project",
data: [
{ input: "hello", expected: "hello world" },
{ input: "test", expected: "test world" },
Expand Down Expand Up @@ -668,7 +661,7 @@ test("tags can be appended and logged to root span", async () => {
evalName: "js-tags-append",
data: [{ input: "hello", expected: "hello world", tags: initialTags }],
task: (input, hooks) => {
for (const t of appendedTags) hooks.tags.push(t);
for (const t of appendedTags) hooks.tags!.push(t);
return input;
},
scores: [() => ({ name: "simple_scorer", score: 0.8 })],
Expand Down Expand Up @@ -825,7 +818,7 @@ test("scorer spans have purpose='scorer' attribute", async () => {
data: [{ input: "hello", expected: "hello" }],
task: async (input: string) => input,
scores: [
(args: { input: string; output: string; expected: string }) => ({
(args: { output: string; expected?: string }) => ({
name: "simple_scorer",
score: args.output === args.expected ? 1 : 0,
}),
Expand Down Expand Up @@ -972,11 +965,12 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{
name: "test-prompt",
slug: "test-prompt",
metadata,
},
} as any,
);

const mockProjectMap = {
Expand All @@ -1001,10 +995,8 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
{
name: "test-prompt",
slug: "test-prompt",
},
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{ name: "test-prompt", slug: "test-prompt" } as any,
);

const mockProjectMap = {
Expand All @@ -1027,11 +1019,12 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{
name: "test-prompt",
slug: "test-prompt",
environments: ["production"],
},
} as any,
);

const mockProjectMap = {
Expand All @@ -1054,11 +1047,12 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{
name: "test-prompt",
slug: "test-prompt",
environments: ["staging", "production"],
},
} as any,
);

const mockProjectMap = {
Expand All @@ -1084,10 +1078,8 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
{
name: "test-prompt",
slug: "test-prompt",
},
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{ name: "test-prompt", slug: "test-prompt" } as any,
);

const mockProjectMap = {
Expand Down Expand Up @@ -1130,11 +1122,8 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
{
name: "test-prompt",
slug: "test-prompt",
tags,
},
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{ name: "test-prompt", slug: "test-prompt", tags } as any,
);

const mockProjectMap = {
Expand All @@ -1159,10 +1148,8 @@ describe("framework2 metadata support", () => {
options: { model: "gpt-4" },
},
[],
{
name: "test-prompt",
slug: "test-prompt",
},
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{ name: "test-prompt", slug: "test-prompt" } as any,
);

const mockProjectMap = {
Expand Down Expand Up @@ -1504,3 +1491,126 @@ test("Eval with enableCache: true (default) uses span cache", async () => {
expect(startSpy).toHaveBeenCalled();
expect(stopSpy).toHaveBeenCalled();
});

// An evaluator that declares only `classifiers` (no `scores`) should expose the
// classifier's output on each result under `classifications`, keyed by the
// `name` the classifier returned.
test("classifier-only evaluator populates classifications field", async () => {
  const { results } = await Eval(
    "test-classifier-only",
    {
      data: [{ input: "hello", expected: "greeting" }],
      task: (input) => input,
      classifiers: [
        () => ({
          name: "category",
          id: "greeting",
          label: "Greeting",
          metadata: { source: "unit-test" },
        }),
      ],
    },
    // returnResults: true so the per-row results are available to assert on.
    { noSendLogs: true, returnResults: true },
  );

  // One data row in, one result out.
  expect(results).toHaveLength(1);

  // The expected entry carries id/label/metadata; `name` appears only as the key.
  const categoryItems = results[0].classifications?.category;
  expect(categoryItems).toEqual([
    {
      id: "greeting",
      label: "Greeting",
      metadata: { source: "unit-test" },
    },
  ]);
});

// An evaluator that declares only `scores` should populate `scores` on each
// result and leave `classifications` undefined.
test("scorer-only evaluator populates scores field", async () => {
  const { results } = await Eval(
    "test-scorer-only",
    {
      data: [{ input: "hello", expected: "hello" }],
      task: (input) => input,
      scores: [
        // Exact-match scorer: 1 when the task output equals `expected`, else 0.
        ({ output, expected }) => ({
          name: "exact_match",
          score: output === expected ? 1 : 0,
        }),
      ],
    },
    { noSendLogs: true, returnResults: true },
  );

  expect(results).toHaveLength(1);

  const firstRow = results[0];
  // The task echoes its input, which matches `expected`, so the score is 1.
  expect(firstRow.scores.exact_match).toBe(1);
  expect(firstRow.classifications).toBeUndefined();
});

// A classifier that returns several items sharing one `name` should have all of
// them collected, in order, under that single key in `classifications`.
test("multiple classifiers returning the same name append items correctly", async () => {
  const { results } = await Eval(
    "test-classifier-append",
    {
      data: [{ input: "hello" }],
      task: (input) => input,
      classifiers: [
        // One classifier emitting two items, both named "category".
        () => [
          { name: "category", id: "greeting", label: "Greeting" },
          { name: "category", id: "informal", label: "Informal" },
        ],
      ],
    },
    { noSendLogs: true, returnResults: true },
  );

  expect(results).toHaveLength(1);

  const categoryItems = results[0].classifications?.category;
  expect(categoryItems).toHaveLength(2);
  // Order follows the order in which the classifier returned the items.
  expect(categoryItems?.[0]).toEqual({
    id: "greeting",
    label: "Greeting",
  });
  expect(categoryItems?.[1]).toEqual({
    id: "informal",
    label: "Informal",
  });
});

// An evaluator that declares both `scores` and `classifiers` should populate
// both `scores` and `classifications` on the same result row.
test("mixed evaluator populates both scores and classifications", async () => {
  const { results } = await Eval(
    "test-score-and-classify",
    {
      data: [{ input: "hello", expected: "hello" }],
      task: (input) => input,
      scores: [
        // Exact-match scorer: 1 when the task output equals `expected`, else 0.
        ({ output, expected }) => ({
          name: "exact_match",
          score: output === expected ? 1 : 0,
        }),
      ],
      classifiers: [
        () => ({ name: "category", id: "greeting", label: "Greeting" }),
      ],
    },
    { noSendLogs: true, returnResults: true },
  );

  expect(results).toHaveLength(1);

  const firstRow = results[0];
  expect(firstRow.scores.exact_match).toBe(1);
  expect(firstRow.classifications?.category).toEqual([
    { id: "greeting", label: "Greeting" },
  ]);
});

// A classifier that returns an object without a `name` should not abort the
// eval: a result row is still produced, and the failure is reported under
// `metadata.classifier_errors` with a descriptive message.
test("malformed classifier output fails clearly", async () => {
  const { results } = await Eval(
    "test-invalid-classifier-output",
    {
      data: [{ input: "hello" }],
      task: (input) => input,
      // `as never` bypasses the type checker so the invalid shape reaches runtime.
      classifiers: [() => ({}) as never],
    },
    { noSendLogs: true, returnResults: true },
  );

  expect(results).toHaveLength(1);

  // The cast is needed to read `metadata` off the result in this test.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const firstRow = results[0] as any;
  expect(firstRow.metadata?.classifier_errors).toMatchObject({
    classifier_0: expect.stringMatching(
      /must return classifications with a non-empty string name/,
    ),
  });
});
Loading
Loading