Skip to content

Commit 3bc4a7c

Browse files
committed
unit and evals
1 parent b497987 commit 3bc4a7c

File tree

8 files changed

+219
-84
lines changed

8 files changed

+219
-84
lines changed

evals/evals.config.json

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
},
77
{
88
"name": "expect_act_timeout",
9-
"categories": ["unit"]
9+
"categories": ["unit", "combination"]
1010
},
1111
{
1212
"name": "extract_repo_name",
@@ -18,7 +18,7 @@
1818
},
1919
{
2020
"name": "instructions",
21-
"categories": ["unit"]
21+
"categories": ["unit", "combination"]
2222
},
2323
{
2424
"name": "bidnet",
@@ -71,7 +71,7 @@
7171
},
7272
{
7373
"name": "peeler_complex",
74-
"categories": ["unit"]
74+
"categories": ["unit", "combination"]
7575
},
7676
{
7777
"name": "sciquest",
@@ -257,11 +257,11 @@
257257
},
258258
{
259259
"name": "observe_iframes1",
260-
"categories": ["unit"]
260+
"categories": ["unit", "combination"]
261261
},
262262
{
263263
"name": "observe_iframes2",
264-
"categories": ["unit"]
264+
"categories": ["unit", "combination"]
265265
},
266266
{
267267
"name": "extract_hamilton_weather",
@@ -293,19 +293,19 @@
293293
},
294294
{
295295
"name": "scroll_50",
296-
"categories": ["unit"]
296+
"categories": ["unit", "combination"]
297297
},
298298
{
299299
"name": "scroll_75",
300-
"categories": ["unit"]
300+
"categories": ["unit", "combination"]
301301
},
302302
{
303303
"name": "nextChunk",
304-
"categories": ["unit"]
304+
"categories": ["unit", "combination"]
305305
},
306306
{
307307
"name": "prevChunk",
308-
"categories": ["unit"]
308+
"categories": ["unit", "combination"]
309309
}
310310
]
311311
}

evals/index.eval.ts

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,19 @@ import { filterByCategory, filterByEvalName, useTextExtract } from "./args";
1919
import { generateExperimentName } from "./utils";
2020
import { exactMatch, errorMatch } from "./scoring";
2121
import { tasksByName, MODELS } from "./taskConfig";
22-
import { Eval } from "braintrust";
22+
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
2323
import { EvalFunction, SummaryResult, Testcase } from "@/types/evals";
2424
import { EvalLogger } from "./logger";
25-
import { AvailableModel } from "@/dist";
25+
import { AvailableModel, LLMClient } from "@/dist";
2626
import { env } from "./env";
2727
import dotenv from "dotenv";
2828
import { StagehandEvalError } from "@/types/stagehandErrors";
2929
import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
3030
import OpenAI from "openai";
3131
import { initStagehand } from "./initStagehand";
32+
import { AISdkClient } from "@/examples/external_clients/aisdk";
33+
import { google } from "@ai-sdk/google";
34+
3235
dotenv.config();
3336

3437
/**
@@ -230,13 +233,20 @@ const generateFilteredTestcases = (): Testcase[] => {
230233
}
231234

232235
// Execute the task
233-
const llmClient = new CustomOpenAIClient({
236+
let llmClient: LLMClient = new CustomOpenAIClient({
234237
modelName: input.modelName as AvailableModel,
235-
client: new OpenAI({
236-
apiKey: process.env.BRAINTRUST_API_KEY,
237-
baseURL: "https://api.braintrust.dev/v1/proxy",
238-
}),
238+
client: wrapOpenAI(
239+
new OpenAI({
240+
apiKey: process.env.BRAINTRUST_API_KEY,
241+
baseURL: "https://api.braintrust.dev/v1/proxy",
242+
}),
243+
),
239244
});
245+
if (input.modelName.startsWith("gemini")) {
246+
llmClient = new AISdkClient({
247+
model: wrapAISDKModel(google("gemini-2.0-flash")),
248+
});
249+
}
240250
const taskInput = await initStagehand({
241251
logger,
242252
llmClient,

evals/initStagehand.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ const StagehandConfig = {
4040
headless: false,
4141
enableCaching,
4242
domSettleTimeoutMs: 30_000,
43+
disablePino: true,
4344
};
4445

4546
/**
@@ -74,7 +75,7 @@ export const initStagehand = async ({
7475
...(domSettleTimeoutMs && { domSettleTimeoutMs }),
7576
actTimeoutMs,
7677
...configOverrides,
77-
logger: logger.log,
78+
logger: logger.log.bind(logger),
7879
};
7980

8081
const stagehand = new Stagehand(config);

evals/logger.ts

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,27 @@ import { Stagehand, LogLine } from "@/dist";
2323
*/
2424
function parseLogLine(logLine: LogLine): LogLineEval {
2525
try {
26+
let parsedAuxiliary: Record<string, unknown> | undefined;
27+
28+
if (logLine.auxiliary) {
29+
parsedAuxiliary = {};
30+
31+
for (const [key, entry] of Object.entries(logLine.auxiliary)) {
32+
try {
33+
parsedAuxiliary[key] =
34+
entry.type === "object" ? JSON.parse(entry.value) : entry.value;
35+
} catch (parseError) {
36+
console.warn(`Failed to parse auxiliary entry ${key}:`, parseError);
37+
// If parsing fails, use the raw value
38+
parsedAuxiliary[key] = entry.value;
39+
}
40+
}
41+
}
42+
2643
return {
2744
...logLine,
28-
// Remove the original auxiliary field in favor of parsedAuxiliary
2945
auxiliary: undefined,
30-
parsedAuxiliary: logLine.auxiliary
31-
? Object.fromEntries(
32-
Object.entries(logLine.auxiliary).map(([key, entry]) => [
33-
key,
34-
entry.type === "object" ? JSON.parse(entry.value) : entry.value,
35-
]),
36-
)
37-
: undefined,
46+
parsedAuxiliary,
3847
} as LogLineEval;
3948
} catch (e) {
4049
console.log("Error parsing log line", logLine);
@@ -56,10 +65,12 @@ function parseLogLine(logLine: LogLine): LogLineEval {
5665
* included in evaluation output.
5766
*/
5867
export class EvalLogger {
59-
logs: LogLineEval[] = [];
68+
private logs: LogLineEval[] = [];
6069
stagehand?: Stagehand;
6170

62-
constructor() {}
71+
constructor() {
72+
this.logs = [];
73+
}
6374

6475
/**
6576
* init:
@@ -78,9 +89,7 @@ export class EvalLogger {
7889
*/
7990
log(logLine: LogLine) {
8091
console.log(logLineToString(logLine));
81-
if (this.logs) {
82-
this.logs.push(parseLogLine(logLine));
83-
}
92+
this.logs.push(parseLogLine(logLine));
8493
}
8594

8695
/**
@@ -108,7 +117,7 @@ export class EvalLogger {
108117
* Retrieves the array of stored log lines.
109118
* Useful for returning logs after a task completes, for analysis or debugging.
110119
*/
111-
getLogs() {
112-
return this.logs;
120+
getLogs(): LogLineEval[] {
121+
return this.logs || [];
113122
}
114123
}

evals/taskConfig.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,8 @@
1212

1313
import fs from "fs";
1414
import path from "path";
15-
import { AvailableModel, AvailableModelSchema } from "@/dist";
15+
import { AvailableModel } from "@/dist";
1616
import { filterByEvalName } from "./args";
17-
import { UnsupportedModelError } from "@/types/stagehandErrors";
1817

1918
// The configuration file `evals.config.json` contains a list of tasks and their associated categories.
2019
const configPath = path.join(__dirname, "evals.config.json");
@@ -50,7 +49,14 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
5049
*/
5150
const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
5251
? process.env.EVAL_MODELS.split(",")
53-
: ["gpt-4o"];
52+
: [
53+
"gemini-2.0-flash",
54+
// "claude-3-5-sonnet-20240620",
55+
// "gpt-4o-mini",
56+
// "gpt-4o",
57+
// "llama-3.3-70b-versatile",
58+
// "llama3.3-70b",
59+
];
5460

5561
/**
5662
* getModelList:
@@ -62,9 +68,6 @@ const getModelList = (): string[] => {
6268
return DEFAULT_EVAL_MODELS;
6369
};
6470
const MODELS: AvailableModel[] = getModelList().map((model) => {
65-
if (!AvailableModelSchema.safeParse(model).success) {
66-
throw new UnsupportedModelError(getModelList(), "Running evals");
67-
}
6871
return model as AvailableModel;
6972
});
7073

evals/tasks/allrecipes.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ export const allrecipes: EvalFunction = async ({
1616
action: 'Type "chocolate chip cookies" in the search bar',
1717
});
1818
await stagehand.page.act({
19-
action: "hit enter",
19+
action: "click the search icon button",
2020
});
2121

2222
const recipeDetails = await stagehand.page.extract({

0 commit comments

Comments
 (0)