Skip to content

Commit 0c7c39b

Browse files
added more evals for agent (#933)
# why
To keep track of agent development progress and fix logging.
# what changed
Added a few evals to the agent category; also patched the logging of image buffers on aisdk providers.
# test plan
---------
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 8de7bd8 commit 0c7c39b

File tree

10 files changed

+368
-7
lines changed

10 files changed

+368
-7
lines changed

evals/deterministic/bb.playwright.config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,5 @@ export default defineConfig({
2828
use: { ...devices["Desktop Chrome"] },
2929
},
3030
],
31+
timeout: 60000,
3132
});

evals/evals.config.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,26 @@
398398
{
399399
"name": "heal_custom_dropdown",
400400
"categories": ["act"]
401+
},
402+
{
403+
"name": "agent/trivago",
404+
"categories": ["agent"]
405+
},
406+
{
407+
"name": "agent/google_maps",
408+
"categories": ["agent"]
409+
},
410+
{
411+
"name": "agent/google_maps_2",
412+
"categories": ["agent"]
413+
},
414+
{
415+
"name": "agent/youtube",
416+
"categories": ["agent"]
417+
},
418+
{
419+
"name": "agent/sign_in",
420+
"categories": ["agent"]
401421
}
402422
]
403423
}

evals/evaluator.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ import {
1919
} from "@/types/evaluator";
2020
import { LLMParsedResponse } from "@/lib/inference";
2121
import { LLMResponse } from "@/lib/llm/LLMClient";
22+
import { LogLine } from "@/types/log";
2223

2324
dotenv.config();
2425

2526
export class Evaluator {
2627
private stagehand: Stagehand;
2728
private modelName: AvailableModel;
2829
private modelClientOptions: ClientOptions | { apiKey: string };
30+
private silentLogger: (message: LogLine) => void;
2931
// Define regex patterns directly in the class or as constants if preferred elsewhere
3032
private yesPattern = /^(YES|Y|TRUE|CORRECT|AFFIRMATIVE)/i;
3133
private noPattern = /^(NO|N|FALSE|INCORRECT|NEGATIVE)/i;
@@ -36,10 +38,12 @@ export class Evaluator {
3638
modelClientOptions?: ClientOptions,
3739
) {
3840
this.stagehand = stagehand;
39-
this.modelName = modelName || "google/gemini-2.0-flash";
41+
this.modelName = modelName || "google/gemini-2.5-flash";
4042
this.modelClientOptions = modelClientOptions || {
4143
apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY || "",
4244
};
45+
// Create a silent logger function that doesn't output anything
46+
this.silentLogger = () => {};
4347
}
4448

4549
/**
@@ -56,7 +60,8 @@ export class Evaluator {
5660
question,
5761
systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer.
5862
Return your response as a JSON object with the following format:
59-
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`,
63+
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
64+
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
6065
screenshotDelayMs = 1000,
6166
strictResponse = false,
6267
} = options;
@@ -71,7 +76,7 @@ export class Evaluator {
7176
const response = await llmClient.createChatCompletion<
7277
LLMParsedResponse<LLMResponse>
7378
>({
74-
logger: this.stagehand.logger,
79+
logger: this.silentLogger,
7580
options: {
7681
messages: [
7782
{ role: "system", content: systemPrompt },
@@ -163,7 +168,8 @@ export class Evaluator {
163168
questions,
164169
systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer.
165170
Return your response as a JSON array, where each object corresponds to a question and has the following format:
166-
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`,
171+
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
172+
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
167173
screenshotDelayMs = 1000,
168174
strictResponse = false,
169175
} = options;
@@ -189,7 +195,7 @@ export class Evaluator {
189195
const response = await llmClient.createChatCompletion<
190196
LLMParsedResponse<LLMResponse>
191197
>({
192-
logger: this.stagehand.logger,
198+
logger: this.silentLogger,
193199
options: {
194200
messages: [
195201
{

evals/tasks/agent/google_maps.ts

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import { EvalFunction } from "@/types/evals";
2+
import { Evaluator } from "../../evaluator";
3+
4+
/**
 * Agent eval: opens Google Maps, asks the agent how long the drive from
 * San Francisco to New York takes, then lets the Evaluator judge whether the
 * page shows that driving time.
 *
 * Result contract:
 * - `_success` is true only when the evaluator answers exactly "YES".
 * - `observations` carries the evaluator's reasoning, or a fixed message when
 *   the verdict is neither "YES" nor "NO".
 * - Any thrown error is returned in `error` instead of propagating.
 *
 * The Stagehand session is closed in every case before the promise settles.
 */
export const google_maps: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  try {
    await stagehand.page.goto("https://maps.google.com");

    // Resolve the agent settings first so the config literal stays flat.
    const provider = modelName.startsWith("claude") ? "anthropic" : "openai";
    const pageTitle = await stagehand.page.title();
    const agent = stagehand.agent({
      model: modelName,
      provider,
      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${pageTitle}`,
    });

    const agentResult = await agent.execute({
      instruction:
        "How long does it take to get from San Francisco to New York driving?",
      maxSteps: 15,
    });
    logger.log(agentResult);

    const verdict = await new Evaluator(stagehand).evaluate({
      question:
        "Does the page show the time it takes to drive from San Francisco to New York at all?",
      strictResponse: true,
    });

    // Anything other than a literal YES/NO is reported as an evaluator fault,
    // not as the evaluator's own reasoning.
    const isRecognised =
      verdict.evaluation === "YES" || verdict.evaluation === "NO";

    return {
      _success: verdict.evaluation === "YES",
      observations: isRecognised
        ? verdict.reasoning
        : "Evaluator provided an invalid response",
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (caught) {
    return {
      _success: false,
      error: caught,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/agent/google_maps_2.ts

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import { EvalFunction } from "@/types/evals";
2+
import { Evaluator } from "../../evaluator";
3+
import { z } from "zod";
4+
5+
/**
 * Agent eval: opens Google Maps, asks the agent for the fastest walking route
 * from La Puerta de Alcalá to La Puerta del Sol, and passes only when
 * (a) the Evaluator answers "YES" and (b) the distance extracted from the page
 * equals 1.5 km.
 *
 * Result contract:
 * - Evaluator verdict that is neither "YES" nor "NO": failure with a fixed
 *   "invalid response" observation.
 * - Verdict "NO": failure carrying the evaluator's reasoning.
 * - Verdict "YES" but a different distance: failure whose observation names
 *   the extracted value.
 * - Any thrown error is returned in `error` instead of propagating.
 *
 * The distance extraction is an extra model call, so it only runs once the
 * evaluator has answered "YES"; its result cannot affect any other outcome.
 * The Stagehand session is closed in every case before the promise settles.
 */
export const google_maps_2: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  // Distance (km) the evaluator question below asks about.
  const expectedDistanceKm = 1.5;

  // Builds the common result envelope; logs are read at return time.
  const report = (success: boolean, observations: string) => ({
    _success: success,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  });

  try {
    await stagehand.page.goto("https://maps.google.com");

    const agent = stagehand.agent({
      model: modelName,
      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
    });

    const agentResult = await agent.execute({
      instruction:
        "Search for the fastest walking route from La Puerta de Alcalá to La Puerta del Sol",
      maxSteps: 15,
    });
    logger.log(agentResult);

    const evaluator = new Evaluator(stagehand);
    const result = await evaluator.evaluate({
      question:
        "Does the page show the fastest walking route from La Puerta de Alcalá to La Puerta del Sol? Does the distance between the two points show as 1.5 km?",
      strictResponse: true,
    });

    if (result.evaluation !== "YES" && result.evaluation !== "NO") {
      return report(false, "Evaluator provided an invalid response");
    }

    if (result.evaluation === "NO") {
      return report(false, result.reasoning);
    }

    // Only a YES verdict needs the independent cross-check of the distance.
    const { distance } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the distance for the fastest route walking to the decimal",
      schema: z.object({
        distance: z
          .number()
          .describe("The distance between the two destinations in km"),
      }),
    });

    if (distance !== expectedDistanceKm) {
      return report(
        false,
        `Distance is not ${expectedDistanceKm} km (extracted: ${distance})`,
      );
    }

    return report(true, result.reasoning);
  } catch (error) {
    return {
      _success: false,
      error: error,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/agent/kayak.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ export const kayak: EvalFunction = async ({
3737
};
3838
}
3939
const { evaluation, reasoning } = await evaluator.evaluate({
40-
question: "Are the flights shown sorted by price?",
40+
question:
41+
"Are the flights shown sorted by price? Check the sort button in the top left corner of the page",
4142
});
4243

4344
const success = evaluation === "YES";

evals/tasks/agent/sign_in.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Agent eval: opens the demo login page, asks the agent to sign in with fixed
 * credentials, and passes when the browser ends up on the `/authorized` URL.
 *
 * Result contract:
 * - `_success` is true only when the final page URL equals the authorized URL
 *   exactly.
 * - `observations` always carries the final page URL.
 * - Any thrown error is returned in `error` instead of propagating.
 *
 * The Stagehand session is closed in every case before the promise settles.
 */
export const sign_in: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  try {
    await stagehand.page.goto("https://v0-modern-login-flow.vercel.app/");

    // Resolve the agent settings first so the config literal stays flat.
    const provider = modelName.startsWith("claude") ? "anthropic" : "openai";
    const pageTitle = await stagehand.page.title();
    const agent = stagehand.agent({
      model: modelName,
      provider,
      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${pageTitle}`,
    });

    // NOTE(review): the email in this instruction reads like an
    // email-obfuscation placeholder rather than a usable address — confirm the
    // intended credentials before relying on this eval.
    const agentResult = await agent.execute({
      instruction:
        "Sign in with the email address '[email protected]' and the password 'stagehand=goated' ",
      maxSteps: 10,
    });
    logger.log(agentResult);

    const finalUrl = await stagehand.page.url();
    const landedOnAuthorized =
      finalUrl === "https://v0-modern-login-flow.vercel.app/authorized";

    return {
      _success: landedOnAuthorized,
      observations: finalUrl,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (caught) {
    return {
      _success: false,
      error: caught,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/agent/trivago.ts

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Returns true when `rawUrl` is hosted on a trivago.com domain (including
 * sub-domains and regional variants such as `trivago.com.mx`) and mentions
 * the H10 Tribeca Madrid hotel slug. Unparseable URLs yield false.
 */
function isTrivagoHotelResultUrl(rawUrl: string): boolean {
  let hostname: string;
  try {
    hostname = new URL(rawUrl).hostname.toLowerCase();
  } catch {
    return false;
  }
  // Match on the host only, so "trivago.com" appearing in another site's
  // path or query string (e.g. a search-engine results page) does not count.
  const isTrivagoHost = /(^|\.)trivago\.com(\.|$)/.test(hostname);
  return isTrivagoHost && rawUrl.includes("hotel-h10-tribeca-madrid");
}

/**
 * Agent eval: opens trivago, asks the agent to find the cheapest room at the
 * H10 Tribeca hotel in Madrid for next weekend, and passes when the browser
 * ends up on a trivago page whose URL references that hotel.
 *
 * Result contract:
 * - `_success` reflects the URL check described above.
 * - `observations` always carries the final page URL.
 * - Any thrown error is returned in `error` instead of propagating.
 *
 * The Stagehand session is closed in every case before the promise settles.
 */
export const trivago: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  try {
    await stagehand.page.goto("https://www.trivago.com/");

    // The task says "next weekend", so the prompt includes today's date
    // (UTC, YYYY-MM-DD) for the agent to resolve it.
    const today = new Date().toISOString().slice(0, 10);
    const agent = stagehand.agent({
      model: modelName,
      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${today}. The current page is ${await stagehand.page.title()}`,
    });

    const agentResult = await agent.execute({
      instruction:
        "Find the cheapest room in the hotel H10 Tribeca in Madrid next weekend. Stop at the trivago page showing the results",
      maxSteps: 13,
    });
    logger.log(agentResult);

    const url = await stagehand.page.url();

    return {
      _success: isTrivagoHotelResultUrl(url),
      observations: url,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    return {
      _success: false,
      error: error,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

0 commit comments

Comments
 (0)