added more evals for agent (#933)

miguelg719 · greptile-apps[bot] · web-flow · commit 0c7c39be1115 · 2025-08-08T11:23:11.000-07:00
# why
To keep track of agent development progress and to fix logging.

# what changed
Added a few evals to the agent category and patched the logging of image
buffers in the aisdk providers.

# test plan

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
diff --git a/evals/deterministic/bb.playwright.config.ts b/evals/deterministic/bb.playwright.config.ts
@@ -28,4 +28,5 @@ export default defineConfig({
       use: { ...devices["Desktop Chrome"] },
     },
   ],
+  timeout: 60000,
 });
diff --git a/evals/evals.config.json b/evals/evals.config.json
@@ -398,6 +398,26 @@
     {
       "name": "heal_custom_dropdown",
       "categories": ["act"]
+    },
+    {
+      "name": "agent/trivago",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_maps",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_maps_2",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/youtube",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/sign_in",
+      "categories": ["agent"]
     }
   ]
 }
diff --git a/evals/evaluator.ts b/evals/evaluator.ts
@@ -19,13 +19,15 @@ import {
 } from "@/types/evaluator";
 import { LLMParsedResponse } from "@/lib/inference";
 import { LLMResponse } from "@/lib/llm/LLMClient";
+import { LogLine } from "@/types/log";
 
 dotenv.config();
 
 export class Evaluator {
   private stagehand: Stagehand;
   private modelName: AvailableModel;
   private modelClientOptions: ClientOptions | { apiKey: string };
+  private silentLogger: (message: LogLine) => void;
   // Define regex patterns directly in the class or as constants if preferred elsewhere
   private yesPattern = /^(YES|Y|TRUE|CORRECT|AFFIRMATIVE)/i;
   private noPattern = /^(NO|N|FALSE|INCORRECT|NEGATIVE)/i;
@@ -36,10 +38,12 @@ export class Evaluator {
     modelClientOptions?: ClientOptions,
   ) {
     this.stagehand = stagehand;
-    this.modelName = modelName || "google/gemini-2.0-flash";
+    this.modelName = modelName || "google/gemini-2.5-flash";
     this.modelClientOptions = modelClientOptions || {
       apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY || "",
     };
+    // Create a silent logger function that doesn't output anything
+    this.silentLogger = () => {};
   }
 
   /**
@@ -56,7 +60,8 @@ export class Evaluator {
       question,
       systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer.
           Return your response as a JSON object with the following format:
-          { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`,
+          { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
+          Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
       screenshotDelayMs = 1000,
       strictResponse = false,
     } = options;
@@ -71,7 +76,7 @@ export class Evaluator {
     const response = await llmClient.createChatCompletion<
       LLMParsedResponse<LLMResponse>
     >({
-      logger: this.stagehand.logger,
+      logger: this.silentLogger,
       options: {
         messages: [
           { role: "system", content: systemPrompt },
@@ -163,7 +168,8 @@ export class Evaluator {
       questions,
       systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer.
           Return your response as a JSON array, where each object corresponds to a question and has the following format:
-          { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`,
+          { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
+          Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
       screenshotDelayMs = 1000,
       strictResponse = false,
     } = options;
@@ -189,7 +195,7 @@ export class Evaluator {
     const response = await llmClient.createChatCompletion<
       LLMParsedResponse<LLMResponse>
     >({
-      logger: this.stagehand.logger,
+      logger: this.silentLogger,
       options: {
         messages: [
           {
diff --git a/evals/tasks/agent/google_maps.ts b/evals/tasks/agent/google_maps.ts
@@ -0,0 +1,74 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "../../evaluator";
+
+/**
+ * Agent eval: starting from Google Maps, the agent is asked how long it takes
+ * to drive from San Francisco to New York, and an Evaluator then judges
+ * whether the page shows that driving time.
+ *
+ * Returns `_success: true` only for a "YES" verdict. A "NO" verdict, an
+ * unrecognized verdict, or a thrown error all yield `_success: false`.
+ * `stagehand.close()` runs in `finally`, so it is awaited on every path.
+ */
+export const google_maps: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  modelName,
+}) => {
+  try {
+    await stagehand.page.goto("https://maps.google.com");
+
+    // Resolve the title up front so the agent config below stays declarative.
+    const pageTitle = await stagehand.page.title();
+    const agent = stagehand.agent({
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${pageTitle}`,
+    });
+
+    const agentResult = await agent.execute({
+      instruction:
+        "How long does it take to get from San Francisco to New York driving?",
+      maxSteps: 15,
+    });
+    logger.log(agentResult);
+
+    const { evaluation, reasoning } = await new Evaluator(stagehand).evaluate({
+      question:
+        "Does the page show the time it takes to drive from San Francisco to New York at all?",
+      strictResponse: true,
+    });
+
+    const passed = evaluation === "YES";
+    if (!passed && evaluation !== "NO") {
+      return {
+        _success: false,
+        observations: "Evaluator provided an invalid response",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    // Valid verdict: pass on "YES", fail on "NO"; report the reasoning either way.
+    return {
+      _success: passed,
+      observations: reasoning,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (err) {
+    return {
+      _success: false,
+      error: err,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/google_maps_2.ts b/evals/tasks/agent/google_maps_2.ts
@@ -0,0 +1,109 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "../../evaluator";
+import { z } from "zod";
+
+/**
+ * Agent eval: starting from Google Maps, the agent is asked to search for the
+ * fastest walking route from La Puerta de Alcalá to La Puerta del Sol.
+ *
+ * Two checks must both pass: the Evaluator must answer "YES" about the page,
+ * and the walking distance extracted from the page must equal 1.5 (km).
+ * The distance is only extracted after a "YES" verdict, so a failed or
+ * invalid verdict is reported as such without an extra extraction call.
+ * A thrown error yields `_success: false`, and `stagehand.close()` is awaited
+ * in `finally` on every path.
+ */
+export const google_maps_2: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  modelName,
+}) => {
+  try {
+    await stagehand.page.goto("https://maps.google.com");
+
+    const pageTitle = await stagehand.page.title();
+    const agent = stagehand.agent({
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${pageTitle}`,
+    });
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Search for the fastest walking route from La Puerta de Alcalá to La Puerta del Sol",
+      maxSteps: 15,
+    });
+    logger.log(agentResult);
+
+    const evaluator = new Evaluator(stagehand);
+    const result = await evaluator.evaluate({
+      question:
+        "Does the page show the fastest walking route from La Puerta de Alcalá to La Puerta del Sol? Does the distance between the two points show as 1.5 km?",
+      strictResponse: true,
+    });
+
+    if (result.evaluation !== "YES" && result.evaluation !== "NO") {
+      return {
+        _success: false,
+        observations: "Evaluator provided an invalid response",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    if (result.evaluation === "NO") {
+      return {
+        _success: false,
+        observations: result.reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    // Verdict is "YES": cross-check it against the distance shown on the page.
+    // Running the extraction only now keeps an extraction failure from masking
+    // a "NO" or invalid verdict as a generic error.
+    const { distance } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction:
+        "Extract the distance for the fastest route walking to the decimal",
+      schema: z.object({
+        distance: z
+          .number()
+          .describe("The distance between the two destinations in km"),
+      }),
+    });
+
+    if (distance !== 1.5) {
+      return {
+        _success: false,
+        observations: "Distance is not 1.5 km",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    return {
+      _success: true,
+      observations: result.reasoning,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (err) {
+    return {
+      _success: false,
+      error: err,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/kayak.ts b/evals/tasks/agent/kayak.ts
@@ -37,7 +37,8 @@ export const kayak: EvalFunction = async ({
       };
     }
     const { evaluation, reasoning } = await evaluator.evaluate({
-      question: "Are the flights shown sorted by price?",
+      question:
+        "Are the flights shown sorted by price? Check the sort button in the top left corner of the page",
     });
 
     const success = evaluation === "YES";
diff --git a/evals/tasks/agent/sign_in.ts b/evals/tasks/agent/sign_in.ts
@@ -0,0 +1,59 @@
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Agent eval: on the demo login page, the agent is asked to sign in with a
+ * fixed email and password. The run passes only when the page URL afterwards
+ * is exactly the `/authorized` URL; the final URL is reported as the
+ * observation either way.
+ *
+ * A thrown error yields `_success: false` with the error attached, and
+ * `stagehand.close()` is awaited in `finally` on every path.
+ */
+export const sign_in: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  modelName,
+}) => {
+  try {
+    await stagehand.page.goto("https://v0-modern-login-flow.vercel.app/");
+
+    const pageTitle = await stagehand.page.title();
+    const agent = stagehand.agent({
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${pageTitle}`,
+    });
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Sign in with the email address 'test@browserbaser.com' and the password 'stagehand=goated' ",
+      maxSteps: 10,
+    });
+    logger.log(agentResult);
+
+    // Success is decided purely by where the browser ended up.
+    const finalUrl = await stagehand.page.url();
+    const landedOnAuthorized =
+      finalUrl === "https://v0-modern-login-flow.vercel.app/authorized";
+
+    return {
+      _success: landedOnAuthorized,
+      observations: finalUrl,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (err) {
+    return {
+      _success: false,
+      error: err,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/trivago.ts b/evals/tasks/agent/trivago.ts
@@ -0,0 +1,63 @@
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Agent eval: from the trivago home page, the agent is asked to find the
+ * cheapest room at the hotel H10 Tribeca in Madrid for next weekend and to
+ * stop on the trivago results page.
+ *
+ * The run passes when the final URL contains both "hotel-h10-tribeca-madrid"
+ * and "trivago.com"; the final URL is reported as the observation either way.
+ * A thrown error yields `_success: false`, and `stagehand.close()` is awaited
+ * in `finally` on every path.
+ */
+export const trivago: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  modelName,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.trivago.com/");
+
+    // Presumably the date lets the agent resolve "next weekend". toISOString()
+    // is UTC-based, so this is the current UTC calendar date (YYYY-MM-DD).
+    const today = new Date().toISOString().slice(0, 10);
+    const pageTitle = await stagehand.page.title();
+
+    const agent = stagehand.agent({
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${today}. The current page is ${pageTitle}`,
+    });
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Find the cheapest room in the hotel H10 Tribeca in Madrid next weekend. Stop at the trivago page showing the results",
+      maxSteps: 13,
+    });
+    logger.log(agentResult);
+
+    const url = await stagehand.page.url();
+    const onHotelResults =
+      url.includes("hotel-h10-tribeca-madrid") && url.includes("trivago.com");
+
+    return {
+      _success: onHotelResults,
+      observations: url,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (err) {
+    return {
+      _success: false,
+      error: err,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/youtube.ts b/evals/tasks/agent/youtube.ts
diff --git a/lib/llm/aisdk.ts b/lib/llm/aisdk.ts

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,8 @@ export const kayak: EvalFunction = async ({`
`37`	`37`	`};`
`38`	`38`	`}`
`39`	`39`	`const { evaluation, reasoning } = await evaluator.evaluate({`
`40`		`- question: "Are the flights shown sorted by price?",`
	`40`	`+ question:`
	`41`	`+ "Are the flights shown sorted by price? Check the sort button in the top left corner of the page",`
`41`	`42`	`});`
`42`	`43`
`43`	`44`	`const success = evaluation === "YES";`