unit and evals

kamath · kamath · commit 3bc4a7c057dc · 2025-04-04T22:53:23.000-07:00
diff --git a/evals/evals.config.json b/evals/evals.config.json
@@ -6,7 +6,7 @@
     },
     {
       "name": "expect_act_timeout",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "extract_repo_name",
@@ -18,7 +18,7 @@
     },
     {
       "name": "instructions",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "bidnet",
@@ -71,7 +71,7 @@
     },
     {
       "name": "peeler_complex",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "sciquest",
@@ -257,11 +257,11 @@
     },
     {
       "name": "observe_iframes1",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "observe_iframes2",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "extract_hamilton_weather",
@@ -293,19 +293,19 @@
     },
     {
       "name": "scroll_50",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "scroll_75",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "nextChunk",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     },
     {
       "name": "prevChunk",
-      "categories": ["unit"]
+      "categories": ["unit", "combination"]
     }
   ]
 }
diff --git a/evals/index.eval.ts b/evals/index.eval.ts
@@ -19,16 +19,19 @@ import { filterByCategory, filterByEvalName, useTextExtract } from "./args";
 import { generateExperimentName } from "./utils";
 import { exactMatch, errorMatch } from "./scoring";
 import { tasksByName, MODELS } from "./taskConfig";
-import { Eval } from "braintrust";
+import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
 import { EvalFunction, SummaryResult, Testcase } from "@/types/evals";
 import { EvalLogger } from "./logger";
-import { AvailableModel } from "@/dist";
+import { AvailableModel, LLMClient } from "@/dist";
 import { env } from "./env";
 import dotenv from "dotenv";
 import { StagehandEvalError } from "@/types/stagehandErrors";
 import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
 import OpenAI from "openai";
 import { initStagehand } from "./initStagehand";
+import { AISdkClient } from "@/examples/external_clients/aisdk";
+import { google } from "@ai-sdk/google";
+
 dotenv.config();
 
 /**
@@ -230,13 +233,20 @@ const generateFilteredTestcases = (): Testcase[] => {
           }
 
           // Execute the task
-          const llmClient = new CustomOpenAIClient({
+          let llmClient: LLMClient = new CustomOpenAIClient({
             modelName: input.modelName as AvailableModel,
-            client: new OpenAI({
-              apiKey: process.env.BRAINTRUST_API_KEY,
-              baseURL: "https://api.braintrust.dev/v1/proxy",
-            }),
+            client: wrapOpenAI(
+              new OpenAI({
+                apiKey: process.env.BRAINTRUST_API_KEY,
+                baseURL: "https://api.braintrust.dev/v1/proxy",
+              }),
+            ),
           });
+          if (input.modelName.startsWith("gemini")) {
+            llmClient = new AISdkClient({
+              model: wrapAISDKModel(google("gemini-2.0-flash")),
+            });
+          }
           const taskInput = await initStagehand({
             logger,
             llmClient,
diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts
@@ -40,6 +40,7 @@ const StagehandConfig = {
   headless: false,
   enableCaching,
   domSettleTimeoutMs: 30_000,
+  disablePino: true,
 };
 
 /**
@@ -74,7 +75,7 @@ export const initStagehand = async ({
     ...(domSettleTimeoutMs && { domSettleTimeoutMs }),
     actTimeoutMs,
     ...configOverrides,
-    logger: logger.log,
+    logger: logger.log.bind(logger),
   };
 
   const stagehand = new Stagehand(config);
diff --git a/evals/logger.ts b/evals/logger.ts
@@ -23,18 +23,27 @@ import { Stagehand, LogLine } from "@/dist";
  */
 function parseLogLine(logLine: LogLine): LogLineEval {
   try {
+    let parsedAuxiliary: Record<string, unknown> | undefined;
+
+    if (logLine.auxiliary) {
+      parsedAuxiliary = {};
+
+      for (const [key, entry] of Object.entries(logLine.auxiliary)) {
+        try {
+          parsedAuxiliary[key] =
+            entry.type === "object" ? JSON.parse(entry.value) : entry.value;
+        } catch (parseError) {
+          console.warn(`Failed to parse auxiliary entry ${key}:`, parseError);
+          // If parsing fails, use the raw value
+          parsedAuxiliary[key] = entry.value;
+        }
+      }
+    }
+
     return {
       ...logLine,
-      // Remove the original auxiliary field in favor of parsedAuxiliary
       auxiliary: undefined,
-      parsedAuxiliary: logLine.auxiliary
-        ? Object.fromEntries(
-            Object.entries(logLine.auxiliary).map(([key, entry]) => [
-              key,
-              entry.type === "object" ? JSON.parse(entry.value) : entry.value,
-            ]),
-          )
-        : undefined,
+      parsedAuxiliary,
     } as LogLineEval;
   } catch (e) {
     console.log("Error parsing log line", logLine);
@@ -56,10 +65,12 @@ function parseLogLine(logLine: LogLine): LogLineEval {
  *   included in evaluation output.
  */
 export class EvalLogger {
-  logs: LogLineEval[] = [];
+  private logs: LogLineEval[] = [];
   stagehand?: Stagehand;
 
-  constructor() {}
+  constructor() {
+    this.logs = [];
+  }
 
   /**
    * init:
@@ -78,9 +89,7 @@ export class EvalLogger {
    */
   log(logLine: LogLine) {
     console.log(logLineToString(logLine));
-    if (this.logs) {
-      this.logs.push(parseLogLine(logLine));
-    }
+    this.logs.push(parseLogLine(logLine));
   }
 
   /**
@@ -108,7 +117,7 @@ export class EvalLogger {
    * Retrieves the array of stored log lines.
    * Useful for returning logs after a task completes, for analysis or debugging.
    */
-  getLogs() {
-    return this.logs;
+  getLogs(): LogLineEval[] {
+    return this.logs || [];
   }
 }
diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
@@ -12,9 +12,8 @@
 
 import fs from "fs";
 import path from "path";
-import { AvailableModel, AvailableModelSchema } from "@/dist";
+import { AvailableModel } from "@/dist";
 import { filterByEvalName } from "./args";
-import { UnsupportedModelError } from "@/types/stagehandErrors";
 
 // The configuration file `evals.config.json` contains a list of tasks and their associated categories.
 const configPath = path.join(__dirname, "evals.config.json");
@@ -50,7 +49,14 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
  */
 const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
   ? process.env.EVAL_MODELS.split(",")
-  : ["gpt-4o"];
+  : [
+      "gemini-2.0-flash",
+      //   "claude-3-5-sonnet-20240620",
+      //   "gpt-4o-mini",
+      //   "gpt-4o",
+      //   "llama-3.3-70b-versatile",
+      //   "llama3.3-70b",
+    ];
 
 /**
  * getModelList:
@@ -62,9 +68,6 @@ const getModelList = (): string[] => {
   return DEFAULT_EVAL_MODELS;
 };
 const MODELS: AvailableModel[] = getModelList().map((model) => {
-  if (!AvailableModelSchema.safeParse(model).success) {
-    throw new UnsupportedModelError(getModelList(), "Running evals");
-  }
   return model as AvailableModel;
 });
 
diff --git a/evals/tasks/allrecipes.ts b/evals/tasks/allrecipes.ts
@@ -16,7 +16,7 @@ export const allrecipes: EvalFunction = async ({
     action: 'Type "chocolate chip cookies" in the search bar',
   });
   await stagehand.page.act({
-    action: "hit enter",
+    action: "click the search icon button",
   });
 
   const recipeDetails = await stagehand.page.extract({
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`},`
`7`	`7`	`{`
`8`	`8`	`"name": "expect_act_timeout",`
`9`		`- "categories": ["unit"]`
	`9`	`+ "categories": ["unit", "combination"]`
`10`	`10`	`},`
`11`	`11`	`{`
`12`	`12`	`"name": "extract_repo_name",`
`@@ -18,7 +18,7 @@`
`18`	`18`	`},`
`19`	`19`	`{`
`20`	`20`	`"name": "instructions",`
`21`		`- "categories": ["unit"]`
	`21`	`+ "categories": ["unit", "combination"]`
`22`	`22`	`},`
`23`	`23`	`{`
`24`	`24`	`"name": "bidnet",`
`@@ -71,7 +71,7 @@`
`71`	`71`	`},`
`72`	`72`	`{`
`73`	`73`	`"name": "peeler_complex",`
`74`		`- "categories": ["unit"]`
	`74`	`+ "categories": ["unit", "combination"]`
`75`	`75`	`},`
`76`	`76`	`{`
`77`	`77`	`"name": "sciquest",`
`@@ -257,11 +257,11 @@`
`257`	`257`	`},`
`258`	`258`	`{`
`259`	`259`	`"name": "observe_iframes1",`
`260`		`- "categories": ["unit"]`
	`260`	`+ "categories": ["unit", "combination"]`
`261`	`261`	`},`
`262`	`262`	`{`
`263`	`263`	`"name": "observe_iframes2",`
`264`		`- "categories": ["unit"]`
	`264`	`+ "categories": ["unit", "combination"]`
`265`	`265`	`},`
`266`	`266`	`{`
`267`	`267`	`"name": "extract_hamilton_weather",`
`@@ -293,19 +293,19 @@`
`293`	`293`	`},`
`294`	`294`	`{`
`295`	`295`	`"name": "scroll_50",`
`296`		`- "categories": ["unit"]`
	`296`	`+ "categories": ["unit", "combination"]`
`297`	`297`	`},`
`298`	`298`	`{`
`299`	`299`	`"name": "scroll_75",`
`300`		`- "categories": ["unit"]`
	`300`	`+ "categories": ["unit", "combination"]`
`301`	`301`	`},`
`302`	`302`	`{`
`303`	`303`	`"name": "nextChunk",`
`304`		`- "categories": ["unit"]`
	`304`	`+ "categories": ["unit", "combination"]`
`305`	`305`	`},`
`306`	`306`	`{`
`307`	`307`	`"name": "prevChunk",`
`308`		`- "categories": ["unit"]`
	`308`	`+ "categories": ["unit", "combination"]`
`309`	`309`	`}`
`310`	`310`	`]`
`311`	`311`	`}`