From a400a7d9f2c2ddf768faa5efc0c4846d8682bd1f Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Fri, 4 Apr 2025 18:28:11 -0700 Subject: [PATCH 01/25] temp --- evals/args.ts | 2 + evals/evals.config.json | 28 ++++---- evals/index.eval.ts | 44 +++++++------ evals/initStagehand.ts | 64 +++++++------------ evals/{tasks => llm_clients}/hn_aisdk.ts | 21 +++--- .../{tasks => llm_clients}/hn_customOpenAI.ts | 15 +++-- evals/{tasks => llm_clients}/hn_langchain.ts | 16 +++-- evals/logger.ts | 4 +- evals/taskConfig.ts | 2 +- evals/tasks/allrecipes.ts | 13 +--- evals/tasks/amazon_add_to_cart.ts | 14 +--- evals/tasks/apple.ts | 15 ++--- evals/tasks/arxiv.ts | 14 +--- evals/tasks/bidnet.ts | 15 ++--- evals/tasks/combination_sauce.ts | 13 +--- evals/tasks/costar.ts | 12 +--- evals/tasks/expect_act_timeout.ts | 12 +--- evals/tasks/expect_act_timeout_global.ts | 32 ---------- evals/tasks/expedia.ts | 15 ++--- evals/tasks/expedia_search.ts | 16 ++--- evals/tasks/extract_aigrant_companies.ts | 14 +--- evals/tasks/extract_aigrant_targeted.ts | 14 +--- evals/tasks/extract_aigrant_targeted_2.ts | 14 +--- evals/tasks/extract_apartments.ts | 14 +--- evals/tasks/extract_area_codes.ts | 14 +--- evals/tasks/extract_baptist_health.ts | 13 +--- evals/tasks/extract_capacitor_info.ts | 15 +---- evals/tasks/extract_collaborators.ts | 13 +--- evals/tasks/extract_csa.ts | 17 ++--- evals/tasks/extract_geniusee.ts | 14 +--- evals/tasks/extract_geniusee_2.ts | 14 +--- evals/tasks/extract_github_commits.ts | 13 +--- evals/tasks/extract_github_stars.ts | 13 +--- evals/tasks/extract_hamilton_weather.ts | 13 +--- evals/tasks/extract_jstor_news.ts | 14 +--- evals/tasks/extract_memorial_healthcare.ts | 14 +--- evals/tasks/extract_nhl_stats.ts | 14 +--- evals/tasks/extract_partners.ts | 13 +--- evals/tasks/extract_press_releases.ts | 14 +--- evals/tasks/extract_professional_info.ts | 13 +--- evals/tasks/extract_public_notices.ts | 14 +--- evals/tasks/extract_recipe.ts | 13 +--- evals/tasks/extract_regulations_table.ts | 13 +--- evals/tasks/extract_repo_name.ts | 12 +--- evals/tasks/extract_resistor_info.ts | 13 +--- evals/tasks/extract_rockauto.ts | 15 +---- .../tasks/extract_snowshoeing_destinations.ts | 13 +--- evals/tasks/extract_staff_members.ts | 14 +--- evals/tasks/extract_zillow.ts | 14 +--- evals/tasks/google_jobs.ts | 13 +--- evals/tasks/history.ts | 15 ++--- evals/tasks/homedepot.ts | 14 +--- evals/tasks/imdb_movie_details.ts | 13 +--- evals/tasks/instructions.ts | 25 +++----- evals/tasks/ionwave.ts | 15 ++--- evals/tasks/ionwave_observe.ts | 15 ++--- evals/tasks/nextChunk.ts | 16 +++-- evals/tasks/nonsense_action.ts | 15 ++--- evals/tasks/observe_amazon_add_to_cart.ts | 12 +--- evals/tasks/observe_github.ts | 15 ++--- evals/tasks/observe_iframes1.ts | 15 ++--- evals/tasks/observe_iframes2.ts | 16 +++-- evals/tasks/observe_simple_google_search.ts | 15 +---- evals/tasks/observe_taxes.ts | 15 ++--- evals/tasks/observe_vantechjournal.ts | 12 +--- evals/tasks/observe_yc_startup.ts | 12 +--- evals/tasks/panamcs.ts | 15 ++--- evals/tasks/peeler_complex.ts | 13 +--- evals/tasks/peeler_simple.ts | 15 ++--- evals/tasks/prevChunk.ts | 15 +++-- evals/tasks/rakuten_jp.ts | 15 ++--- evals/tasks/sciquest.ts | 13 +--- evals/tasks/scroll_50.ts | 15 ++--- evals/tasks/scroll_75.ts | 15 +++-- evals/tasks/shopify_homepage.ts | 15 ++--- evals/tasks/simple_google_search.ts | 12 +--- evals/tasks/stock_x.ts | 15 ++--- evals/tasks/ted_talk.ts | 13 +--- evals/tasks/vanta.ts | 15 ++--- evals/tasks/vanta_h.ts | 15 ++--- evals/tasks/vantechjournal.ts | 15 ++--- evals/tasks/wichita.ts | 13 +--- evals/tasks/wikipedia.ts | 15 ++--- examples/external_clients/customOpenAI.ts | 23 +++++++ lib/StagehandPage.ts | 8 +-- stagehand.config.ts | 2 +- types/evals.ts | 11 ++-- types/stagehandErrors.ts | 10 +-- 88 files changed, 451 insertions(+), 865 deletions(-) rename evals/{tasks => llm_clients}/hn_aisdk.ts (91%) rename evals/{tasks => llm_clients}/hn_customOpenAI.ts (91%) rename evals/{tasks => llm_clients}/hn_langchain.ts (91%) delete mode 100644 evals/tasks/expect_act_timeout_global.ts diff --git a/evals/args.ts b/evals/args.ts index d09d50f2c..ea06b6d47 100644 --- a/evals/args.ts +++ b/evals/args.ts @@ -66,6 +66,8 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES "regression_llm_providers", "regression_text_extract", "regression_dom_extract", + "llm_clients", + "unit", ]; // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name diff --git a/evals/evals.config.json b/evals/evals.config.json index 1c822c851..9ad4fd8f5 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -6,11 +6,7 @@ }, { "name": "expect_act_timeout", - "categories": ["act"] - }, - { - "name": "expect_act_timeout_global", - "categories": ["act"] + "categories": ["unit"] }, { "name": "extract_repo_name", @@ -22,7 +18,7 @@ }, { "name": "instructions", - "categories": ["combination"] + "categories": ["unit"] }, { "name": "bidnet", @@ -75,7 +71,7 @@ }, { "name": "peeler_complex", - "categories": ["combination"] + "categories": ["unit"] }, { "name": "sciquest", @@ -87,15 +83,15 @@ }, { "name": "hn_aisdk", - "categories": ["regression_llm_providers"] + "categories": ["llm_clients"] }, { "name": "hn_langchain", - "categories": ["regression_llm_providers"] + "categories": ["llm_clients"] }, { "name": "hn_customOpenAI", - "categories": ["regression_llm_providers"] + "categories": ["llm_clients"] }, { "name": "apple", @@ -261,11 +257,11 @@ }, { "name": "observe_iframes1", - "categories": ["observe"] + "categories": ["unit"] }, { "name": "observe_iframes2", - "categories": ["observe"] + "categories": ["unit"] }, { "name": "extract_hamilton_weather", @@ -297,19 +293,19 @@ }, { "name": "scroll_50", - "categories": ["act"] + "categories": ["unit"] }, { "name": "scroll_75", - "categories": ["act", "regression_dom_extract"] + "categories": ["unit"] }, { "name": "nextChunk", - "categories": ["act"] + "categories": ["unit"] }, { "name": "prevChunk", - "categories": ["act"] + "categories": ["unit"] } ] } diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 008ebbcdc..e429c4b6e 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -1,6 +1,6 @@ /** * This script orchestrates the running of evaluations against a set of tasks. - * It braintrust to run multiple testcases (each testcase representing a + * It uses Braintrust to run multiple testcases (each testcase representing a * given task-model combination) and then aggregates the results, producing * a summary of passes, failures, and categorized success rates. * @@ -15,13 +15,7 @@ import fs from "fs"; import path from "path"; import process from "process"; -import { - filterByCategory, - filterByEvalName, - useTextExtract, - useAccessibilityTree, -} from "./args"; - +import { filterByCategory, filterByEvalName, useTextExtract } from "./args"; import { generateExperimentName } from "./utils"; import { exactMatch, errorMatch } from "./scoring"; import { tasksByName, MODELS } from "./taskConfig"; @@ -32,6 +26,9 @@ import { AvailableModel } from "@/dist"; import { env } from "./env"; import dotenv from "dotenv"; import { StagehandEvalError } from "@/types/stagehandErrors"; +import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI"; +import OpenAI from "openai"; +import { initStagehand } from "./initStagehand"; dotenv.config(); /** @@ -233,18 +230,29 @@ const generateFilteredTestcases = (): Testcase[] => { } // Execute the task - const result = await taskFunction({ - modelName: input.modelName, + const llmClient = new CustomOpenAIClient({ + modelName: input.modelName as AvailableModel, + client: new OpenAI({ + apiKey: process.env.BRAINTRUST_API_KEY, + baseURL: "https://api.braintrust.dev/v1/proxy", + }), + }); + const taskInput = await initStagehand({ logger, + llmClient, useTextExtract, - useAccessibilityTree, }); - - // Log result to console - if (result && result._success) { - console.log(`✅ ${input.name}: Passed`); - } else { - console.log(`❌ ${input.name}: Failed`); + let result; + try { + result = await taskFunction(taskInput); + // Log result to console + if (result && result._success) { + console.log(`✅ ${input.name}: Passed`); + } else { + console.log(`❌ ${input.name}: Failed`); + } + } finally { + await taskInput.stagehand.close(); } return result; } catch (error) { @@ -256,7 +264,7 @@ const generateFilteredTestcases = (): Testcase[] => { auxiliary: { error: { value: error.message, - type: "object", + type: "string", }, trace: { value: error.stack, diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts index 86e60d202..7769bb6a0 100644 --- a/evals/initStagehand.ts +++ b/evals/initStagehand.ts @@ -11,15 +11,17 @@ */ import { enableCaching, env } from "./env"; -import { - AvailableModel, - ConstructorParams, - LLMClient, - LogLine, - Stagehand, -} from "@/dist"; +import { ConstructorParams, LLMClient, Stagehand } from "@/dist"; import { EvalLogger } from "./logger"; -import { StagehandEvalError } from "@/types/stagehandErrors"; + +export type StagehandInitResult = { + stagehand: Stagehand; + logger: EvalLogger; + debugUrl: string; + sessionUrl: string; + useTextExtract: boolean; + stagehandConfig: ConstructorParams; +}; /** * StagehandConfig: @@ -38,12 +40,6 @@ const StagehandConfig = { headless: false, enableCaching, domSettleTimeoutMs: 30_000, - modelName: "gpt-4o", // default model, can be overridden by initStagehand arguments - modelClientOptions: { - apiKey: process.env.OPENAI_API_KEY, - }, - logger: (logLine: LogLine) => - console.log(`[stagehand::${logLine.category}] ${logLine.message}`), }; /** @@ -59,45 +55,26 @@ const StagehandConfig = { */ export const initStagehand = async ({ llmClient, - modelName, domSettleTimeoutMs, logger, configOverrides, actTimeoutMs, + useTextExtract, }: { - llmClient?: LLMClient; - modelName?: AvailableModel; + llmClient: LLMClient; domSettleTimeoutMs?: number; logger: EvalLogger; configOverrides?: Partial; actTimeoutMs?: number; -}) => { - if (llmClient && modelName) { - throw new StagehandEvalError("Cannot provide both llmClient and modelName"); - } - - if (!llmClient && !modelName) { - throw new StagehandEvalError("Must provide either llmClient or modelName"); - } - - let chosenApiKey: string | undefined = process.env.OPENAI_API_KEY; - if (modelName?.startsWith("claude")) { - chosenApiKey = process.env.ANTHROPIC_API_KEY; - } - + useTextExtract?: boolean; +}): Promise => { const config = { ...StagehandConfig, - modelName, llmClient, ...(domSettleTimeoutMs && { domSettleTimeoutMs }), - modelClientOptions: { - apiKey: chosenApiKey, - }, actTimeoutMs, - logger: (logLine: LogLine) => { - logger.log(logLine); - }, ...configOverrides, + logger: logger.log, }; const stagehand = new Stagehand(config); @@ -105,6 +82,13 @@ export const initStagehand = async ({ // Associate the logger with the Stagehand instance logger.init(stagehand); - const initResponse = await stagehand.init(); - return { stagehand, logger, initResponse }; + const { debugUrl, sessionUrl } = await stagehand.init(); + return { + stagehand, + stagehandConfig: config, + logger, + debugUrl, + sessionUrl, + useTextExtract, + }; }; diff --git a/evals/tasks/hn_aisdk.ts b/evals/llm_clients/hn_aisdk.ts similarity index 91% rename from evals/tasks/hn_aisdk.ts rename to evals/llm_clients/hn_aisdk.ts index 02a17736a..51ab4937e 100644 --- a/evals/tasks/hn_aisdk.ts +++ b/evals/llm_clients/hn_aisdk.ts @@ -1,19 +1,22 @@ +import { Stagehand } from "@/dist"; +import { AISdkClient } from "@/examples/external_clients/aisdk"; import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; -import { z } from "zod"; import { openai } from "@ai-sdk/openai/dist"; -import { AISdkClient } from "@/examples/external_clients/aisdk"; +import { z } from "zod"; -export const hn_aisdk: EvalFunction = async ({ logger }) => { - const { stagehand, initResponse } = await initStagehand({ - logger, +export const hn_aisdk: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehandConfig, + logger, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, llmClient: new AISdkClient({ model: openai("gpt-4o-mini"), }), }); - - const { debugUrl, sessionUrl } = initResponse; - + await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hackernews/", ); diff --git a/evals/tasks/hn_customOpenAI.ts b/evals/llm_clients/hn_customOpenAI.ts similarity index 91% rename from evals/tasks/hn_customOpenAI.ts rename to evals/llm_clients/hn_customOpenAI.ts index 94f85c879..7b4f1f1e5 100644 --- a/evals/tasks/hn_customOpenAI.ts +++ b/evals/llm_clients/hn_customOpenAI.ts @@ -1,12 +1,17 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI"; import OpenAI from "openai"; +import { Stagehand } from "@/dist"; -export const hn_customOpenAI: EvalFunction = async ({ logger }) => { - const { stagehand, initResponse } = await initStagehand({ - logger, +export const hn_customOpenAI: EvalFunction = async ({ + logger, + stagehandConfig, + debugUrl, + sessionUrl, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, llmClient: new CustomOpenAIClient({ modelName: "gpt-4o-mini", client: new OpenAI({ @@ -15,7 +20,7 @@ export const hn_customOpenAI: EvalFunction = async ({ logger }) => { }), }); - const { debugUrl, sessionUrl } = initResponse; + await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hackernews/", diff --git a/evals/tasks/hn_langchain.ts b/evals/llm_clients/hn_langchain.ts similarity index 91% rename from evals/tasks/hn_langchain.ts rename to evals/llm_clients/hn_langchain.ts index dd16efdad..35fc6b68d 100644 --- a/evals/tasks/hn_langchain.ts +++ b/evals/llm_clients/hn_langchain.ts @@ -1,20 +1,24 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; import { LangchainClient } from "@/examples/external_clients/langchain"; import { ChatOpenAI } from "@langchain/openai"; +import { Stagehand } from "@/dist"; -export const hn_langchain: EvalFunction = async ({ logger }) => { - const { stagehand, initResponse } = await initStagehand({ - logger, +export const hn_langchain: EvalFunction = async ({ + logger, + stagehandConfig, + debugUrl, + sessionUrl, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, llmClient: new LangchainClient( new ChatOpenAI({ model: "gpt-4o-mini", }), ), }); - - const { debugUrl, sessionUrl } = initResponse; + await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hackernews/", diff --git a/evals/logger.ts b/evals/logger.ts index 078f9eabf..d07ddfcbe 100644 --- a/evals/logger.ts +++ b/evals/logger.ts @@ -78,7 +78,9 @@ export class EvalLogger { */ log(logLine: LogLine) { console.log(logLineToString(logLine)); - this.logs.push(parseLogLine(logLine)); + if (this.logs) { + this.logs.push(parseLogLine(logLine)); + } } /** diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index 590467b5c..03dff1a24 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -50,7 +50,7 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) { */ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS ? process.env.EVAL_MODELS.split(",") - : ["gpt-4o", "claude-3-5-sonnet-latest"]; + : ["gpt-4o"]; /** * getModelList: diff --git a/evals/tasks/allrecipes.ts b/evals/tasks/allrecipes.ts index 6da2bf8dd..5f515b48d 100644 --- a/evals/tasks/allrecipes.ts +++ b/evals/tasks/allrecipes.ts @@ -1,19 +1,13 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; import { z } from "zod"; export const allrecipes: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.allrecipes.com/", { waitUntil: "domcontentloaded", }); @@ -34,7 +28,6 @@ export const allrecipes: EvalFunction = async ({ .string() .describe("Total number of ratings for the recipe"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/amazon_add_to_cart.ts b/evals/tasks/amazon_add_to_cart.ts index e7ba19c82..6272ae054 100644 --- a/evals/tasks/amazon_add_to_cart.ts +++ b/evals/tasks/amazon_add_to_cart.ts @@ -1,17 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; export const amazon_add_to_cart: EvalFunction = async ({ - modelName, logger, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://www.amazon.com/Laptop-MacBook-Surface-Water-Resistant-Accessories/dp/B0D5M4H5CD", ); @@ -32,8 +26,6 @@ export const amazon_add_to_cart: EvalFunction = async ({ const currentUrl = stagehand.page.url(); const expectedUrlPrefix = "https://www.amazon.com/ap/signin"; - await stagehand.close(); - return { _success: currentUrl.startsWith(expectedUrlPrefix), currentUrl, diff --git a/evals/tasks/apple.ts b/evals/tasks/apple.ts index 7cbcdc747..1926a1718 100644 --- a/evals/tasks/apple.ts +++ b/evals/tasks/apple.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const apple: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const apple: EvalFunction = async ({ + logger, + debugUrl, + sessionUrl, + stagehand, +}) => { await stagehand.page.goto("https://www.apple.com/iphone-16-pro/"); await stagehand.page.act({ action: "click on the buy button" }); diff --git a/evals/tasks/arxiv.ts b/evals/tasks/arxiv.ts index c3edcf8cf..919d28c7f 100644 --- a/evals/tasks/arxiv.ts +++ b/evals/tasks/arxiv.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const arxiv: EvalFunction = async ({ - modelName, logger, + debugUrl, + sessionUrl, + stagehand, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://arxiv.org/search/"); @@ -35,7 +29,6 @@ export const arxiv: EvalFunction = async ({ ) .describe("list of papers"), }), - modelName, useTextExtract, }); @@ -93,7 +86,6 @@ export const arxiv: EvalFunction = async ({ ) .nullable(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/bidnet.ts b/evals/tasks/bidnet.ts index f7ebe56a7..d73c83030 100644 --- a/evals/tasks/bidnet.ts +++ b/evals/tasks/bidnet.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const bidnet: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const bidnet: EvalFunction = async ({ + logger, + debugUrl, + sessionUrl, + stagehand, +}) => { await stagehand.page.goto("https://www.bidnetdirect.com/"); await stagehand.page.act({ diff --git a/evals/tasks/combination_sauce.ts b/evals/tasks/combination_sauce.ts index c5573675e..fc8bfc32e 100644 --- a/evals/tasks/combination_sauce.ts +++ b/evals/tasks/combination_sauce.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const combination_sauce: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://www.saucedemo.com/"); @@ -23,7 +17,6 @@ export const combination_sauce: EvalFunction = async ({ usernames: z.array(z.string()).describe("the accepted usernames"), password: z.string().describe("the password for login"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/costar.ts b/evals/tasks/costar.ts index 48bc8dd26..221919e4e 100644 --- a/evals/tasks/costar.ts +++ b/evals/tasks/costar.ts @@ -1,18 +1,13 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; import { z } from "zod"; export const costar: EvalFunction = async ({ - modelName, logger, + debugUrl, + sessionUrl, + stagehand, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; // TODO: fix this eval - does not work in headless mode try { await Promise.race([ @@ -33,7 +28,6 @@ export const costar: EvalFunction = async ({ schema: z.object({ title: z.string().describe("the title of the article").nullable(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/expect_act_timeout.ts b/evals/tasks/expect_act_timeout.ts index 821be2ef4..0a6f45170 100644 --- a/evals/tasks/expect_act_timeout.ts +++ b/evals/tasks/expect_act_timeout.ts @@ -1,17 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const expect_act_timeout: EvalFunction = async ({ - modelName, logger, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://docs.stagehand.dev"); const result = await stagehand.page.act({ action: "search for 'Stagehand'", diff --git a/evals/tasks/expect_act_timeout_global.ts b/evals/tasks/expect_act_timeout_global.ts deleted file mode 100644 index ab39cd904..000000000 --- a/evals/tasks/expect_act_timeout_global.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { initStagehand } from "@/evals/initStagehand"; -import { EvalFunction } from "@/types/evals"; - -export const expect_act_timeout_global: EvalFunction = async ({ - modelName, - logger, -}) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - actTimeoutMs: 1_000, - }); - - const { debugUrl, sessionUrl } = initResponse; - - await stagehand.page.goto("https://docs.stagehand.dev"); - - const result = await stagehand.page.act({ - action: "search for 'Stagehand'", - timeoutMs: 1_000, - }); - console.log("RESULT", result); - - await stagehand.close(); - - return { - _success: !result.success, - debugUrl, - sessionUrl, - logs: logger.getLogs(), - }; -}; diff --git a/evals/tasks/expedia.ts b/evals/tasks/expedia.ts index 192456e90..ac5645a5d 100644 --- a/evals/tasks/expedia.ts +++ b/evals/tasks/expedia.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const expedia: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const expedia: EvalFunction = async ({ + logger, + debugUrl, + sessionUrl, + stagehand, +}) => { try { await stagehand.page.goto("https://www.expedia.com/flights"); await stagehand.page.act( diff --git a/evals/tasks/expedia_search.ts b/evals/tasks/expedia_search.ts index 21f0feed9..592e553dd 100644 --- a/evals/tasks/expedia_search.ts +++ b/evals/tasks/expedia_search.ts @@ -1,17 +1,13 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const expedia_search: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const expedia_search: EvalFunction = async ({ + logger, + debugUrl, + sessionUrl, + stagehand, +}) => { try { await stagehand.page.goto("https://www.expedia.com/flights"); - await stagehand.page.act({ action: "find round-trip flights from San Francisco (SFO) to Toronto (YYZ) for Jan 1, 2025 (up to one to two weeks)", diff --git a/evals/tasks/extract_aigrant_companies.ts b/evals/tasks/extract_aigrant_companies.ts index 7fb86636b..f3e1f19cb 100644 --- a/evals/tasks/extract_aigrant_companies.ts +++ b/evals/tasks/extract_aigrant_companies.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_aigrant_companies: EvalFunction = async ({ - modelName, logger, + debugUrl, + sessionUrl, + stagehand, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); @@ -29,7 +22,6 @@ export const extract_aigrant_companies: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_aigrant_targeted.ts b/evals/tasks/extract_aigrant_targeted.ts index 626d0b6ad..d72328505 100644 --- a/evals/tasks/extract_aigrant_targeted.ts +++ b/evals/tasks/extract_aigrant_targeted.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_aigrant_targeted: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); @@ -24,7 +17,6 @@ export const extract_aigrant_targeted: EvalFunction = async ({ schema: z.object({ company_name: z.string(), }), - modelName, useTextExtract, selector: selector, }); diff --git a/evals/tasks/extract_aigrant_targeted_2.ts b/evals/tasks/extract_aigrant_targeted_2.ts index 3be8ef1ee..772d2f4a4 100644 --- a/evals/tasks/extract_aigrant_targeted_2.ts +++ b/evals/tasks/extract_aigrant_targeted_2.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_aigrant_targeted_2: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); @@ -24,7 +17,6 @@ export const extract_aigrant_targeted_2: EvalFunction = async ({ schema: z.object({ company_name: z.string(), }), - modelName, useTextExtract, selector: selector, }); diff --git a/evals/tasks/extract_apartments.ts b/evals/tasks/extract_apartments.ts index dee0b5daf..96cfd06b7 100644 --- a/evals/tasks/extract_apartments.ts +++ b/evals/tasks/extract_apartments.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "../initStagehand"; import { EvalFunction } from "../../types/evals"; export const extract_apartments: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://www.apartments.com/san-francisco-ca/2-bedrooms/", ); @@ -29,7 +22,6 @@ export const extract_apartments: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_area_codes.ts b/evals/tasks/extract_area_codes.ts index 5a5094b79..07f99ec32 100644 --- a/evals/tasks/extract_area_codes.ts +++ b/evals/tasks/extract_area_codes.ts @@ -1,20 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_area_codes: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - - await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/ncc-area-codes/", { waitUntil: "domcontentloaded" }, @@ -44,7 +37,6 @@ export const extract_area_codes: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_baptist_health.ts b/evals/tasks/extract_baptist_health.ts index 2fde75b30..1bb90d83e 100644 --- a/evals/tasks/extract_baptist_health.ts +++ b/evals/tasks/extract_baptist_health.ts @@ -1,20 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { compareStrings } from "@/evals/utils"; import { z } from "zod"; export const extract_baptist_health: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/baptist-health/", ); @@ -27,7 +21,6 @@ export const extract_baptist_health: EvalFunction = async ({ phone: z.string(), fax: z.string(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_capacitor_info.ts b/evals/tasks/extract_capacitor_info.ts index 97963428a..f416b6a75 100644 --- a/evals/tasks/extract_capacitor_info.ts +++ b/evals/tasks/extract_capacitor_info.ts @@ -1,20 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { normalizeString } from "@/evals/utils"; import { z } from "zod"; export const extract_capacitor_info: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/capacitor/", ); @@ -26,12 +20,9 @@ export const extract_capacitor_info: EvalFunction = async ({ RoHS_Status: z.string(), Impedance: z.string(), }), - modelName, useTextExtract, }); - await stagehand.close(); - const { ECCN_code, RoHS_Status, Impedance } = result; const expected = { diff --git a/evals/tasks/extract_collaborators.ts b/evals/tasks/extract_collaborators.ts index 2fda50bf0..b0da6f37a 100644 --- a/evals/tasks/extract_collaborators.ts +++ b/evals/tasks/extract_collaborators.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_collaborators: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://github.com/facebook/react"); await stagehand.page.act({ @@ -32,7 +26,6 @@ export const extract_collaborators: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_csa.ts b/evals/tasks/extract_csa.ts index 5ce1f58a6..9609d1d5d 100644 --- a/evals/tasks/extract_csa.ts +++ b/evals/tasks/extract_csa.ts @@ -1,20 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_csa: EvalFunction = async ({ - modelName, - logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, + logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - - const { page } = stagehand; + const page = stagehand.page; await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/csa/", ); @@ -32,7 +26,6 @@ export const extract_csa: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_geniusee.ts b/evals/tasks/extract_geniusee.ts index 9aa3045a0..18664ad6d 100644 --- a/evals/tasks/extract_geniusee.ts +++ b/evals/tasks/extract_geniusee.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_geniusee: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/", ); @@ -25,7 +18,6 @@ export const extract_geniusee: EvalFunction = async ({ schema: z.object({ scalability: z.string(), }), - modelName, useTextExtract, selector: selector, }); diff --git a/evals/tasks/extract_geniusee_2.ts b/evals/tasks/extract_geniusee_2.ts index 526bf3cab..4a3f8006b 100644 --- a/evals/tasks/extract_geniusee_2.ts +++ b/evals/tasks/extract_geniusee_2.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_geniusee_2: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/", ); @@ -25,7 +18,6 @@ export const extract_geniusee_2: EvalFunction = async ({ schema: z.object({ scalability: z.string(), }), - modelName, useTextExtract, selector: selector, }); diff --git a/evals/tasks/extract_github_commits.ts b/evals/tasks/extract_github_commits.ts index 79e53f019..4d8256345 100644 --- a/evals/tasks/extract_github_commits.ts +++ b/evals/tasks/extract_github_commits.ts @@ -1,19 +1,13 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; import { z } from "zod"; export const extract_github_commits: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://github.com/facebook/react"); @@ -32,7 +26,6 @@ export const extract_github_commits: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_github_stars.ts b/evals/tasks/extract_github_stars.ts index 3d6188a31..7ab994076 100644 --- a/evals/tasks/extract_github_stars.ts +++ b/evals/tasks/extract_github_stars.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_github_stars: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://github.com/facebook/react"); @@ -22,7 +16,6 @@ export const extract_github_stars: EvalFunction = async ({ schema: z.object({ stars: z.number().describe("the number of stars for the project"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_hamilton_weather.ts b/evals/tasks/extract_hamilton_weather.ts index be6388bd8..683c14c65 100644 --- a/evals/tasks/extract_hamilton_weather.ts +++ b/evals/tasks/extract_hamilton_weather.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_hamilton_weather: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hamilton-weather/", @@ -31,7 +25,6 @@ export const extract_hamilton_weather: EvalFunction = async ({ barometer: z.string(), visibility: z.string(), }), - modelName, useTextExtract, selector: xpath, }); diff --git a/evals/tasks/extract_jstor_news.ts b/evals/tasks/extract_jstor_news.ts index 88530bcd0..4c5c690c9 100644 --- a/evals/tasks/extract_jstor_news.ts +++ b/evals/tasks/extract_jstor_news.ts @@ -1,20 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_jstor_news: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - - await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/jstor/", { @@ -37,7 +30,6 @@ export const extract_jstor_news: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_memorial_healthcare.ts b/evals/tasks/extract_memorial_healthcare.ts index 706a18a49..c01dd5480 100644 --- a/evals/tasks/extract_memorial_healthcare.ts +++ b/evals/tasks/extract_memorial_healthcare.ts @@ -1,21 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; import { compareStrings } from "@/evals/utils"; export const extract_memorial_healthcare: EvalFunction = async ({ - modelName, logger, useTextExtract, + debugUrl, + sessionUrl, + stagehand, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/mycmh/", ); @@ -32,7 +25,6 @@ export const extract_memorial_healthcare: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_nhl_stats.ts b/evals/tasks/extract_nhl_stats.ts index 91e5690b9..fb9e982ac 100644 --- a/evals/tasks/extract_nhl_stats.ts +++ b/evals/tasks/extract_nhl_stats.ts @@ -1,21 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { normalizeString } from "@/evals/utils"; import { z } from "zod"; export const extract_nhl_stats: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 4000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://www.hockeydb.com/ihdb/stats/top_league.php?lid=nhl1927&sid=1990", { @@ -31,7 +24,6 @@ export const extract_nhl_stats: EvalFunction = async ({ num_goals: z.string(), team: z.string(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_partners.ts b/evals/tasks/extract_partners.ts index 2a571d12b..e1851802f 100644 --- a/evals/tasks/extract_partners.ts +++ b/evals/tasks/extract_partners.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_partners: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://ramp.com"); @@ -44,7 +38,6 @@ export const extract_partners: EvalFunction = async ({ .optional() .describe("Any explanation about partner listing or absence thereof"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_press_releases.ts b/evals/tasks/extract_press_releases.ts index 85c58f826..b2fe815e3 100644 --- a/evals/tasks/extract_press_releases.ts +++ b/evals/tasks/extract_press_releases.ts @@ -1,21 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; import { compareStrings } from "@/evals/utils"; export const extract_press_releases: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - const schema = z.object({ items: z.array( z.object({ @@ -42,7 +35,6 @@ export const extract_press_releases: EvalFunction = async ({ instruction: "extract the title and corresponding publish date of EACH AND EVERY press releases on this page. DO NOT MISS ANY PRESS RELEASES.", schema, - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_professional_info.ts b/evals/tasks/extract_professional_info.ts index 551e71a4e..c77708f5c 100644 --- a/evals/tasks/extract_professional_info.ts +++ b/evals/tasks/extract_professional_info.ts @@ -1,20 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { normalizeString } from "@/evals/utils"; import { z } from "zod"; export const extract_professional_info: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/professional-info/", ); @@ -27,7 +21,6 @@ export const extract_professional_info: EvalFunction = async ({ phone: z.string(), fax: z.string(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_public_notices.ts b/evals/tasks/extract_public_notices.ts index 410bff7eb..73a76864c 100644 --- a/evals/tasks/extract_public_notices.ts +++ b/evals/tasks/extract_public_notices.ts @@ -1,21 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; import { compareStrings } from "@/evals/utils"; export const extract_public_notices: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - - await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/sars/", { waitUntil: "load" }, @@ -43,7 +36,6 @@ export const extract_public_notices: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_recipe.ts b/evals/tasks/extract_recipe.ts index 4fa126c71..1e5040688 100644 --- a/evals/tasks/extract_recipe.ts +++ b/evals/tasks/extract_recipe.ts @@ -1,19 +1,13 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; import { z } from "zod"; export const extract_recipe: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/allrecipes-extract/", { @@ -37,7 +31,6 @@ export const extract_recipe: EvalFunction = async ({ "the number of teaspoons of lemon juice needed for the mushroom pan sauce", ), }), - modelName, useTextExtract, selector: selector, }); diff --git a/evals/tasks/extract_regulations_table.ts b/evals/tasks/extract_regulations_table.ts index 9dc8880b1..5bd75abe3 100644 --- a/evals/tasks/extract_regulations_table.ts +++ b/evals/tasks/extract_regulations_table.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_regulations_table: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/ncc-numbering-plan/", @@ -35,7 +29,6 @@ export const extract_regulations_table: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, selector: xpath, }); diff --git a/evals/tasks/extract_repo_name.ts b/evals/tasks/extract_repo_name.ts index ec06576a5..ff4f48238 100644 --- a/evals/tasks/extract_repo_name.ts +++ b/evals/tasks/extract_repo_name.ts @@ -1,17 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; export const extract_repo_name: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://github.com/facebook/react"); diff --git a/evals/tasks/extract_resistor_info.ts b/evals/tasks/extract_resistor_info.ts index 13d2abc02..4f4f0b265 100644 --- a/evals/tasks/extract_resistor_info.ts +++ b/evals/tasks/extract_resistor_info.ts @@ -1,20 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { normalizeString } from "@/evals/utils"; import { z } from "zod"; export const extract_resistor_info: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/resistor/", ); @@ -28,7 +22,6 @@ export const extract_resistor_info: EvalFunction = async ({ resistance: z.string(), operating_temperature_range: z.string(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_rockauto.ts b/evals/tasks/extract_rockauto.ts index e439d20f5..f154790de 100644 --- a/evals/tasks/extract_rockauto.ts +++ b/evals/tasks/extract_rockauto.ts @@ -1,20 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const extract_rockauto: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 10000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/rockauto/", ); @@ -30,9 +23,7 @@ export const extract_rockauto: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, - domSettleTimeoutMs: 10000, }); await stagehand.close(); diff --git a/evals/tasks/extract_snowshoeing_destinations.ts b/evals/tasks/extract_snowshoeing_destinations.ts index 0fa09fcef..b35e7be64 100644 --- a/evals/tasks/extract_snowshoeing_destinations.ts +++ b/evals/tasks/extract_snowshoeing_destinations.ts @@ -1,19 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_snowshoeing_destinations: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto( "https://www.cbisland.com/blog/10-snowshoeing-adventures-on-cape-breton-island/", @@ -40,7 +34,6 @@ export const extract_snowshoeing_destinations: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_staff_members.ts b/evals/tasks/extract_staff_members.ts index ff177a467..7cf396773 100644 --- a/evals/tasks/extract_staff_members.ts +++ b/evals/tasks/extract_staff_members.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const extract_staff_members: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/panamcs/", ); @@ -30,7 +23,6 @@ export const extract_staff_members: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/extract_zillow.ts b/evals/tasks/extract_zillow.ts index 6df50bf73..1902f281f 100644 --- a/evals/tasks/extract_zillow.ts +++ b/evals/tasks/extract_zillow.ts @@ -1,20 +1,13 @@ import { z } from "zod"; -import { initStagehand } from "../initStagehand"; import { EvalFunction } from "../../types/evals"; export const extract_zillow: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/zillow/", ); @@ -31,7 +24,6 @@ export const extract_zillow: EvalFunction = async ({ }), ), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/google_jobs.ts b/evals/tasks/google_jobs.ts index 96e5eb62c..a8e139988 100644 --- a/evals/tasks/google_jobs.ts +++ b/evals/tasks/google_jobs.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const google_jobs: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://www.google.com/"); await stagehand.page.act("click on the about page"); @@ -46,7 +40,6 @@ export const google_jobs: EvalFunction = async ({ .nullable(), }), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/history.ts b/evals/tasks/history.ts index 6dbfcc1c5..a41eb256b 100644 --- a/evals/tasks/history.ts +++ b/evals/tasks/history.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const history: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const history: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://docs.stagehand.dev"); await stagehand.page.act("click on the 'Quickstart' tab"); diff --git a/evals/tasks/homedepot.ts b/evals/tasks/homedepot.ts index dff2828ed..ca4b39b85 100644 --- a/evals/tasks/homedepot.ts +++ b/evals/tasks/homedepot.ts @@ -1,20 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const homedepot: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 60_000, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto("https://www.homedepot.com/"); await stagehand.page.act("search for gas grills"); @@ -33,7 +26,6 @@ export const homedepot: EvalFunction = async ({ ) .describe("Gas grill Primary Burner BTU exact value"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/imdb_movie_details.ts b/evals/tasks/imdb_movie_details.ts index 78e30c2bd..6e45bd2a0 100644 --- a/evals/tasks/imdb_movie_details.ts +++ b/evals/tasks/imdb_movie_details.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const imdb_movie_details: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.imdb.com/title/tt0111161/", { waitUntil: "domcontentloaded", }); @@ -28,7 +22,6 @@ export const imdb_movie_details: EvalFunction = async ({ .array(z.string()) .describe("List of countries with the most ratings"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/instructions.ts b/evals/tasks/instructions.ts index ca4cd74df..1142ca862 100644 --- a/evals/tasks/instructions.ts +++ b/evals/tasks/instructions.ts @@ -1,18 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const instructions: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - configOverrides: { - systemPrompt: - "if the users says `secret12345`, click on the 'getting started' tab", - }, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const instructions: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { try { const page = stagehand.page; @@ -29,8 +22,6 @@ export const instructions: EvalFunction = async ({ modelName, logger }) => { const isCorrectUrl = url === "https://docs.browserbase.com/introduction/getting-started"; - await stagehand.close(); - return { _success: isCorrectUrl, debugUrl, @@ -40,8 +31,6 @@ export const instructions: EvalFunction = async ({ modelName, logger }) => { } catch (error) { console.error("Error or timeout occurred:", error); - await stagehand.close(); - return { _success: false, error: JSON.parse(JSON.stringify(error, null, 2)), @@ -49,5 +38,7 @@ export const instructions: EvalFunction = async ({ modelName, logger }) => { sessionUrl, logs: logger.getLogs(), }; + } finally { + await stagehand.close(); } }; diff --git a/evals/tasks/ionwave.ts b/evals/tasks/ionwave.ts index 4b7e0ea63..e4d4e200d 100644 --- a/evals/tasks/ionwave.ts +++ b/evals/tasks/ionwave.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const ionwave: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const ionwave: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://elpasotexas.ionwave.net/Login.aspx"); await stagehand.page.act({ diff --git a/evals/tasks/ionwave_observe.ts b/evals/tasks/ionwave_observe.ts index b0433342f..8fa3197dc 100644 --- a/evals/tasks/ionwave_observe.ts +++ b/evals/tasks/ionwave_observe.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const ionwave_observe: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const ionwave_observe: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://elpasotexas.ionwave.net/Login.aspx"); const observations = await stagehand.page.observe({ onlyVisible: true }); diff --git a/evals/tasks/nextChunk.ts b/evals/tasks/nextChunk.ts index b133d5d83..9b1bc9c29 100644 --- a/evals/tasks/nextChunk.ts +++ b/evals/tasks/nextChunk.ts @@ -1,15 +1,17 @@ -import { initStagehand } from "@/evals/initStagehand"; +import { Stagehand } from "@/dist"; import { EvalFunction } from "@/types/evals"; -export const nextChunk: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, +export const nextChunk: EvalFunction = async ({ + logger, + stagehandConfig, + debugUrl, + sessionUrl, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, domSettleTimeoutMs: 3000, }); - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.apartments.com/san-francisco-ca/"); await stagehand.page.act({ action: "click on the all filters button", diff --git a/evals/tasks/nonsense_action.ts b/evals/tasks/nonsense_action.ts index a5cbdde54..be6f90dbd 100644 --- a/evals/tasks/nonsense_action.ts +++ b/evals/tasks/nonsense_action.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const nonsense_action: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const nonsense_action: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { try { await stagehand.page.goto("https://www.homedepot.com/"); diff --git a/evals/tasks/observe_amazon_add_to_cart.ts b/evals/tasks/observe_amazon_add_to_cart.ts index 8be49da13..39607dbf4 100644 --- a/evals/tasks/observe_amazon_add_to_cart.ts +++ b/evals/tasks/observe_amazon_add_to_cart.ts @@ -1,18 +1,12 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { performPlaywrightMethod } from "@/lib/a11y/utils"; export const observe_amazon_add_to_cart: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://www.amazon.com/Laptop-MacBook-Surface-Water-Resistant-Accessories/dp/B0D5M4H5CD", ); diff --git a/evals/tasks/observe_github.ts b/evals/tasks/observe_github.ts index a20651ada..036255f85 100644 --- a/evals/tasks/observe_github.ts +++ b/evals/tasks/observe_github.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const observe_github: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const observe_github: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://github.com/numpy/numpy/tree/main/numpy"); const observations = await stagehand.page.observe({ diff --git a/evals/tasks/observe_iframes1.ts b/evals/tasks/observe_iframes1.ts index 81ca9ff34..f1cbc68ee 100644 --- a/evals/tasks/observe_iframes1.ts +++ b/evals/tasks/observe_iframes1.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const observe_iframes1: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const observe_iframes1: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/"); const observations = await stagehand.page.observe({ diff --git a/evals/tasks/observe_iframes2.ts b/evals/tasks/observe_iframes2.ts index e280ae634..90ce03a7a 100644 --- a/evals/tasks/observe_iframes2.ts +++ b/evals/tasks/observe_iframes2.ts @@ -1,14 +1,16 @@ -import { initStagehand } from "@/evals/initStagehand"; +import { Stagehand } from "@/dist"; import { EvalFunction } from "@/types/evals"; -export const observe_iframes2: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, +export const observe_iframes2: EvalFunction = async ({ + logger, + stagehandConfig, + debugUrl, + sessionUrl, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, }); - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://iframetester.com/?url=https://shopify.com", ); diff --git a/evals/tasks/observe_simple_google_search.ts b/evals/tasks/observe_simple_google_search.ts index 16eb38ff0..7e6fd1331 100644 --- a/evals/tasks/observe_simple_google_search.ts +++ b/evals/tasks/observe_simple_google_search.ts @@ -1,23 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { performPlaywrightMethod } from "@/lib/a11y/utils"; export const observe_simple_google_search: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.google.com"); - // await stagehand.page.act({ - // action: 'Search for "OpenAI"', - // }); const observation1 = await stagehand.page.observe({ instruction: "Find the search bar and enter 'OpenAI'", onlyVisible: false, diff --git a/evals/tasks/observe_taxes.ts b/evals/tasks/observe_taxes.ts index 4d5b5fddb..3ba0b3676 100644 --- a/evals/tasks/observe_taxes.ts +++ b/evals/tasks/observe_taxes.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const observe_taxes: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const observe_taxes: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://file.1040.com/estimate/"); const observations = await stagehand.page.observe({ diff --git a/evals/tasks/observe_vantechjournal.ts b/evals/tasks/observe_vantechjournal.ts index 4ca7dbbf9..c3eef6a71 100644 --- a/evals/tasks/observe_vantechjournal.ts +++ b/evals/tasks/observe_vantechjournal.ts @@ -1,17 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const observe_vantechjournal: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://vantechjournal.com/archive?page=8"); await stagehand.page.waitForTimeout(1000); diff --git a/evals/tasks/observe_yc_startup.ts b/evals/tasks/observe_yc_startup.ts index 5dd4bb53d..9470e6323 100644 --- a/evals/tasks/observe_yc_startup.ts +++ b/evals/tasks/observe_yc_startup.ts @@ -1,17 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; export const observe_yc_startup: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.ycombinator.com/companies"); await stagehand.page.waitForLoadState("networkidle"); diff --git a/evals/tasks/panamcs.ts b/evals/tasks/panamcs.ts index a9cb87f73..5447daa98 100644 --- a/evals/tasks/panamcs.ts +++ b/evals/tasks/panamcs.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const panamcs: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const panamcs: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/panamcs/", ); diff --git a/evals/tasks/peeler_complex.ts b/evals/tasks/peeler_complex.ts index 2ea6f1b6e..4b9ad170c 100644 --- a/evals/tasks/peeler_complex.ts +++ b/evals/tasks/peeler_complex.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const peeler_complex: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - try { await stagehand.page.goto(`https://chefstoys.com/`, { timeout: 60000 }); await stagehand.page.waitForLoadState("networkidle"); @@ -33,7 +27,6 @@ export const peeler_complex: EvalFunction = async ({ const { price } = await stagehand.page.extract({ instruction: "get the price of the peeler", schema: z.object({ price: z.number().nullable() }), - modelName, useTextExtract, }); diff --git a/evals/tasks/peeler_simple.ts b/evals/tasks/peeler_simple.ts index f8e6dbaeb..12628c6fa 100644 --- a/evals/tasks/peeler_simple.ts +++ b/evals/tasks/peeler_simple.ts @@ -1,5 +1,4 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { StagehandEnvironmentError } from "@/types/stagehandErrors"; const env: "BROWSERBASE" | "LOCAL" = @@ -7,14 +6,12 @@ const env: "BROWSERBASE" | "LOCAL" = ? "BROWSERBASE" : "LOCAL"; -export const peeler_simple: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - +export const peeler_simple: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { if (env === "BROWSERBASE") { throw new StagehandEnvironmentError( "BROWSERBASE", diff --git a/evals/tasks/prevChunk.ts b/evals/tasks/prevChunk.ts index 5bb3e4977..95f06f814 100644 --- a/evals/tasks/prevChunk.ts +++ b/evals/tasks/prevChunk.ts @@ -1,14 +1,17 @@ -import { initStagehand } from "@/evals/initStagehand"; +import { Stagehand } from "@/dist"; import { EvalFunction } from "@/types/evals"; -export const prevChunk: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, +export const prevChunk: EvalFunction = async ({ + logger, + stagehandConfig, + debugUrl, + sessionUrl, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, domSettleTimeoutMs: 3000, }); - const { debugUrl, sessionUrl } = initResponse; await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); diff --git a/evals/tasks/rakuten_jp.ts b/evals/tasks/rakuten_jp.ts index 03a5a949b..cf8ea55fe 100644 --- a/evals/tasks/rakuten_jp.ts +++ b/evals/tasks/rakuten_jp.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const rakuten_jp: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const rakuten_jp: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://www.rakuten.co.jp/"); await stagehand.page.act({ action: "click on online supermarket" }); diff --git a/evals/tasks/sciquest.ts b/evals/tasks/sciquest.ts index dcab28905..42bc535d7 100644 --- a/evals/tasks/sciquest.ts +++ b/evals/tasks/sciquest.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const sciquest: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://bids.sciquest.com/apps/Router/PublicEvent?tab=PHX_NAV_SourcingAllOpps&CustomerOrg=StateOfUtah", ); @@ -28,7 +22,6 @@ export const sciquest: EvalFunction = async ({ schema: z.object({ total_results: z.string(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/scroll_50.ts b/evals/tasks/scroll_50.ts index ada05ca3d..c6abb53c3 100644 --- a/evals/tasks/scroll_50.ts +++ b/evals/tasks/scroll_50.ts @@ -1,14 +1,11 @@ -import { initStagehand } from "@/evals/initStagehand"; import { EvalFunction } from "@/types/evals"; -export const scroll_50: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - domSettleTimeoutMs: 3000, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const scroll_50: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); diff --git a/evals/tasks/scroll_75.ts b/evals/tasks/scroll_75.ts index 04da73f39..b804eb1e4 100644 --- a/evals/tasks/scroll_75.ts +++ b/evals/tasks/scroll_75.ts @@ -1,14 +1,17 @@ -import { initStagehand } from "@/evals/initStagehand"; +import { Stagehand } from "@/dist"; import { EvalFunction } from "@/types/evals"; -export const scroll_75: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, +export const scroll_75: EvalFunction = async ({ + logger, + stagehandConfig, + debugUrl, + sessionUrl, +}) => { + const stagehand = new Stagehand({ + ...stagehandConfig, domSettleTimeoutMs: 3000, }); - const { debugUrl, sessionUrl } = initResponse; await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); diff --git a/evals/tasks/shopify_homepage.ts b/evals/tasks/shopify_homepage.ts index e846422e1..03bd7f1fe 100644 --- a/evals/tasks/shopify_homepage.ts +++ b/evals/tasks/shopify_homepage.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const shopify_homepage: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const shopify_homepage: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://www.shopify.com/"); const observations = await stagehand.page.observe({ onlyVisible: true }); diff --git a/evals/tasks/simple_google_search.ts b/evals/tasks/simple_google_search.ts index dc35068c3..e6a681200 100644 --- a/evals/tasks/simple_google_search.ts +++ b/evals/tasks/simple_google_search.ts @@ -1,17 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; export const simple_google_search: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.google.com"); await stagehand.page.act({ diff --git a/evals/tasks/stock_x.ts b/evals/tasks/stock_x.ts index 878e6fa50..b1459b0a6 100644 --- a/evals/tasks/stock_x.ts +++ b/evals/tasks/stock_x.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const stock_x: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const stock_x: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto( "https://stockx.com/air-jordan-3-retro-black-cement-2024", ); diff --git a/evals/tasks/ted_talk.ts b/evals/tasks/ted_talk.ts index 7b0253c6c..d288df8f6 100644 --- a/evals/tasks/ted_talk.ts +++ b/evals/tasks/ted_talk.ts @@ -1,20 +1,14 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { normalizeString } from "@/evals/utils"; import { z } from "zod"; export const ted_talk: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto( "https://www.ted.com/talks/sir_ken_robinson_do_schools_kill_creativity", { @@ -39,7 +33,6 @@ export const ted_talk: EvalFunction = async ({ ) .describe("List of culture video playlists"), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/vanta.ts b/evals/tasks/vanta.ts index 2959389c4..016127392 100644 --- a/evals/tasks/vanta.ts +++ b/evals/tasks/vanta.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const vanta: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const vanta: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://www.vanta.com/"); await stagehand.page.act({ action: "close the cookies popup" }); diff --git a/evals/tasks/vanta_h.ts b/evals/tasks/vanta_h.ts index eca69a0fc..1eb139002 100644 --- a/evals/tasks/vanta_h.ts +++ b/evals/tasks/vanta_h.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const vanta_h: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const vanta_h: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://www.vanta.com/"); const observations = await stagehand.page.observe({ diff --git a/evals/tasks/vantechjournal.ts b/evals/tasks/vantechjournal.ts index 2922727d6..c176142b0 100644 --- a/evals/tasks/vantechjournal.ts +++ b/evals/tasks/vantechjournal.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const vantechjournal: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const vantechjournal: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto("https://vantechjournal.com/"); await stagehand.page.act({ diff --git a/evals/tasks/wichita.ts b/evals/tasks/wichita.ts index 3a56c9d39..fe645b480 100644 --- a/evals/tasks/wichita.ts +++ b/evals/tasks/wichita.ts @@ -1,19 +1,13 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; import { z } from "zod"; export const wichita: EvalFunction = async ({ - modelName, + debugUrl, + sessionUrl, + stagehand, logger, useTextExtract, }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; - await stagehand.page.goto("https://www.wichitafallstx.gov/Bids.aspx"); await stagehand.page.act({ @@ -25,7 +19,6 @@ export const wichita: EvalFunction = async ({ schema: z.object({ total_results: z.string(), }), - modelName, useTextExtract, }); diff --git a/evals/tasks/wikipedia.ts b/evals/tasks/wikipedia.ts index 60178e5cf..5dad05d2f 100644 --- a/evals/tasks/wikipedia.ts +++ b/evals/tasks/wikipedia.ts @@ -1,14 +1,11 @@ import { EvalFunction } from "@/types/evals"; -import { initStagehand } from "@/evals/initStagehand"; - -export const wikipedia: EvalFunction = async ({ modelName, logger }) => { - const { stagehand, initResponse } = await initStagehand({ - modelName, - logger, - }); - - const { debugUrl, sessionUrl } = initResponse; +export const wikipedia: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, +}) => { await stagehand.page.goto(`https://en.wikipedia.org/wiki/Baseball`); await stagehand.page.act({ action: 'click the "hit and run" link in this article', diff --git a/examples/external_clients/customOpenAI.ts b/examples/external_clients/customOpenAI.ts index 6a6d70b3f..a87ab8742 100644 --- a/examples/external_clients/customOpenAI.ts +++ b/examples/external_clients/customOpenAI.ts @@ -45,6 +45,29 @@ export class CustomOpenAIClient extends LLMClient { retries = 3, logger, }: CreateChatCompletionOptions): Promise { + logger({ + category: "openai", + message: "starting chat completion", + level: 1, + auxiliary: { + options: { + value: JSON.stringify(options), + type: "object", + }, + client: { + value: JSON.stringify({ + apiKey: this.client.apiKey, + baseURL: this.client.baseURL, + }), + type: "object", + }, + modelName: { + value: this.modelName, + type: "string", + }, + }, + }); + const { image, requestId, ...optionsWithoutImageAndRequestId } = options; // TODO: Implement vision support diff --git a/lib/StagehandPage.ts b/lib/StagehandPage.ts index 5f1c7eb9d..6d2aad423 100644 --- a/lib/StagehandPage.ts +++ b/lib/StagehandPage.ts @@ -368,7 +368,7 @@ export class StagehandPage { if (err instanceof StagehandError || err instanceof StagehandAPIError) { throw err; } - throw new StagehandDefaultError(); + throw new StagehandDefaultError(err); } } @@ -559,7 +559,7 @@ export class StagehandPage { if (err instanceof StagehandError || err instanceof StagehandAPIError) { throw err; } - throw new StagehandDefaultError(); + throw new StagehandDefaultError(err); } } @@ -683,7 +683,7 @@ export class StagehandPage { if (err instanceof StagehandError || err instanceof StagehandAPIError) { throw err; } - throw new StagehandDefaultError(); + throw new StagehandDefaultError(err); } } @@ -796,7 +796,7 @@ export class StagehandPage { if (err instanceof StagehandError || err instanceof StagehandAPIError) { throw err; } - throw new StagehandDefaultError(); + throw new StagehandDefaultError(err); } } diff --git a/stagehand.config.ts b/stagehand.config.ts index 672352ffb..4c901799b 100644 --- a/stagehand.config.ts +++ b/stagehand.config.ts @@ -3,7 +3,7 @@ import dotenv from "dotenv"; dotenv.config(); const StagehandConfig: ConstructorParams = { - verbose: 1 /* Verbosity level for logging: 0 = silent, 1 = info, 2 = all */, + verbose: 2 /* Verbosity level for logging: 0 = silent, 1 = info, 2 = all */, domSettleTimeoutMs: 30_000 /* Timeout for DOM to settle in milliseconds */, // LLM configuration diff --git a/types/evals.ts b/types/evals.ts index 400d52800..efb8c0da5 100644 --- a/types/evals.ts +++ b/types/evals.ts @@ -1,15 +1,10 @@ import { z } from "zod"; -import type { EvalLogger } from "../evals/logger"; import type { AvailableModel } from "../types/model"; import type { LogLine } from "../types/log"; import type { EvalCase } from "braintrust"; +import { StagehandInitResult } from "@/evals/initStagehand"; -export type EvalFunction = (args: { - modelName: AvailableModel; - logger: EvalLogger; - useTextExtract: boolean; - useAccessibilityTree: boolean; -}) => Promise<{ +export type EvalFunction = (taskInput: StagehandInitResult) => Promise<{ _success: boolean; logs: LogLine[]; debugUrl: string; @@ -28,6 +23,8 @@ export const EvalCategorySchema = z.enum([ "regression_text_extract", "regression_dom_extract", "regression_llm_providers", + "llm_clients", + "unit", ]); export type EvalCategory = z.infer; diff --git a/types/stagehandErrors.ts b/types/stagehandErrors.ts index e5efeb2ed..83b2bd269 100644 --- a/types/stagehandErrors.ts +++ b/types/stagehandErrors.ts @@ -6,10 +6,12 @@ export class StagehandError extends Error { } export class StagehandDefaultError extends StagehandError { - constructor() { - super( - `\nHey! We're sorry you ran into an error. \nIf you need help, please open a Github issue or reach out to us on Slack: https://stagehand.dev/slack\n`, - ); + constructor(error?: unknown) { + if (error instanceof Error || error instanceof StagehandError) { + super( + `\nHey! We're sorry you ran into an error. \nIf you need help, please open a Github issue or reach out to us on Slack: https://stagehand.dev/slack\n\nFull error:\n${error.message}`, + ); + } } } From 4270b1a0b4d33241a36e6a57900a07a489383e7b Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Fri, 4 Apr 2025 19:21:05 -0700 Subject: [PATCH 02/25] temp --- evals/tasks/instructions.ts | 2 +- evals/tasks/nextChunk.ts | 1 + evals/tasks/observe_iframes2.ts | 1 + evals/tasks/prevChunk.ts | 1 + evals/tasks/scroll_75.ts | 1 + examples/external_clients/customOpenAI.ts | 5 +---- 6 files changed, 6 insertions(+), 5 deletions(-) diff --git a/evals/tasks/instructions.ts b/evals/tasks/instructions.ts index 1142ca862..75e4955b7 100644 --- a/evals/tasks/instructions.ts +++ b/evals/tasks/instructions.ts @@ -20,7 +20,7 @@ export const instructions: EvalFunction = async ({ const url = page.url(); const isCorrectUrl = - url === "https://docs.browserbase.com/introduction/getting-started"; + url === "https://docs.browserbase.com/introduction/what-is-browserbase"; return { _success: isCorrectUrl, diff --git a/evals/tasks/nextChunk.ts b/evals/tasks/nextChunk.ts index 9b1bc9c29..6591cb8c5 100644 --- a/evals/tasks/nextChunk.ts +++ b/evals/tasks/nextChunk.ts @@ -11,6 +11,7 @@ export const nextChunk: EvalFunction = async ({ ...stagehandConfig, domSettleTimeoutMs: 3000, }); + await stagehand.init(); await stagehand.page.goto("https://www.apartments.com/san-francisco-ca/"); await stagehand.page.act({ diff --git a/evals/tasks/observe_iframes2.ts b/evals/tasks/observe_iframes2.ts index 90ce03a7a..3934449ad 100644 --- a/evals/tasks/observe_iframes2.ts +++ b/evals/tasks/observe_iframes2.ts @@ -10,6 +10,7 @@ export const observe_iframes2: EvalFunction = async ({ const stagehand = new Stagehand({ ...stagehandConfig, }); + await stagehand.init(); await stagehand.page.goto( "https://iframetester.com/?url=https://shopify.com", diff --git a/evals/tasks/prevChunk.ts b/evals/tasks/prevChunk.ts index 95f06f814..0db2266bf 100644 --- a/evals/tasks/prevChunk.ts +++ b/evals/tasks/prevChunk.ts @@ -11,6 +11,7 @@ export const prevChunk: EvalFunction = async ({ ...stagehandConfig, domSettleTimeoutMs: 3000, }); + await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", diff --git a/evals/tasks/scroll_75.ts b/evals/tasks/scroll_75.ts index b804eb1e4..3959837ca 100644 --- a/evals/tasks/scroll_75.ts +++ b/evals/tasks/scroll_75.ts @@ -11,6 +11,7 @@ export const scroll_75: EvalFunction = async ({ ...stagehandConfig, domSettleTimeoutMs: 3000, }); + await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", diff --git a/examples/external_clients/customOpenAI.ts b/examples/external_clients/customOpenAI.ts index a87ab8742..0d4c35c3a 100644 --- a/examples/external_clients/customOpenAI.ts +++ b/examples/external_clients/customOpenAI.ts @@ -50,12 +50,9 @@ export class CustomOpenAIClient extends LLMClient { message: "starting chat completion", level: 1, auxiliary: { - options: { - value: JSON.stringify(options), - type: "object", - }, client: { value: JSON.stringify({ + modelName: this.modelName, apiKey: this.client.apiKey, baseURL: this.client.baseURL, }), From 2dfabd78483f5f90c1df7cba5289214491f68550 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Fri, 4 Apr 2025 19:23:19 -0700 Subject: [PATCH 03/25] custom openai --- examples/external_clients/customOpenAI.ts | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/examples/external_clients/customOpenAI.ts b/examples/external_clients/customOpenAI.ts index 0d4c35c3a..6a6d70b3f 100644 --- a/examples/external_clients/customOpenAI.ts +++ b/examples/external_clients/customOpenAI.ts @@ -45,26 +45,6 @@ export class CustomOpenAIClient extends LLMClient { retries = 3, logger, }: CreateChatCompletionOptions): Promise { - logger({ - category: "openai", - message: "starting chat completion", - level: 1, - auxiliary: { - client: { - value: JSON.stringify({ - modelName: this.modelName, - apiKey: this.client.apiKey, - baseURL: this.client.baseURL, - }), - type: "object", - }, - modelName: { - value: this.modelName, - type: "string", - }, - }, - }); - const { image, requestId, ...optionsWithoutImageAndRequestId } = options; // TODO: Implement vision support From b90071115451a82bf140a4d9a517291298871e3f Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Fri, 4 Apr 2025 22:48:23 -0700 Subject: [PATCH 04/25] unit and evals --- evals/evals.config.json | 18 ++-- evals/index.eval.ts | 24 +++-- evals/initStagehand.ts | 3 +- evals/logger.ts | 41 ++++---- evals/taskConfig.ts | 15 +-- evals/tasks/allrecipes.ts | 2 +- package-lock.json | 192 ++++++++++++++++++++++++++++++-------- package.json | 8 +- 8 files changed, 219 insertions(+), 84 deletions(-) diff --git a/evals/evals.config.json b/evals/evals.config.json index 9ad4fd8f5..ba7fb31e4 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -6,7 +6,7 @@ }, { "name": "expect_act_timeout", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "extract_repo_name", @@ -18,7 +18,7 @@ }, { "name": "instructions", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "bidnet", @@ -71,7 +71,7 @@ }, { "name": "peeler_complex", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "sciquest", @@ -257,11 +257,11 @@ }, { "name": "observe_iframes1", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "observe_iframes2", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "extract_hamilton_weather", @@ -293,19 +293,19 @@ }, { "name": "scroll_50", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "scroll_75", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "nextChunk", - "categories": ["unit"] + "categories": ["unit", "combination"] }, { "name": "prevChunk", - "categories": ["unit"] + "categories": ["unit", "combination"] } ] } diff --git a/evals/index.eval.ts b/evals/index.eval.ts index e429c4b6e..79538f14b 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -19,16 +19,19 @@ import { filterByCategory, filterByEvalName, useTextExtract } from "./args"; import { generateExperimentName } from "./utils"; import { exactMatch, errorMatch } from "./scoring"; import { tasksByName, MODELS } from "./taskConfig"; -import { Eval } from "braintrust"; +import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust"; import { EvalFunction, SummaryResult, Testcase } from "@/types/evals"; import { EvalLogger } from "./logger"; -import { AvailableModel } from "@/dist"; +import { AvailableModel, LLMClient } from "@/dist"; import { env } from "./env"; import dotenv from "dotenv"; import { StagehandEvalError } from "@/types/stagehandErrors"; import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI"; import OpenAI from "openai"; import { initStagehand } from "./initStagehand"; +import { AISdkClient } from "@/examples/external_clients/aisdk"; +import { google } from "@ai-sdk/google"; + dotenv.config(); /** @@ -230,13 +233,20 @@ const generateFilteredTestcases = (): Testcase[] => { } // Execute the task - const llmClient = new CustomOpenAIClient({ + let llmClient: LLMClient = new CustomOpenAIClient({ modelName: input.modelName as AvailableModel, - client: new OpenAI({ - apiKey: process.env.BRAINTRUST_API_KEY, - baseURL: "https://api.braintrust.dev/v1/proxy", - }), + client: wrapOpenAI( + new OpenAI({ + apiKey: process.env.BRAINTRUST_API_KEY, + baseURL: "https://api.braintrust.dev/v1/proxy", + }), + ), }); + if (input.modelName.startsWith("gemini")) { + llmClient = new AISdkClient({ + model: wrapAISDKModel(google("gemini-2.0-flash")), + }); + } const taskInput = await initStagehand({ logger, llmClient, diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts index 7769bb6a0..ab9c16856 100644 --- a/evals/initStagehand.ts +++ b/evals/initStagehand.ts @@ -40,6 +40,7 @@ const StagehandConfig = { headless: false, enableCaching, domSettleTimeoutMs: 30_000, + disablePino: true, }; /** @@ -74,7 +75,7 @@ export const initStagehand = async ({ ...(domSettleTimeoutMs && { domSettleTimeoutMs }), actTimeoutMs, ...configOverrides, - logger: logger.log, + logger: logger.log.bind(logger), }; const stagehand = new Stagehand(config); diff --git a/evals/logger.ts b/evals/logger.ts index d07ddfcbe..c234ada44 100644 --- a/evals/logger.ts +++ b/evals/logger.ts @@ -23,18 +23,27 @@ import { Stagehand, LogLine } from "@/dist"; */ function parseLogLine(logLine: LogLine): LogLineEval { try { + let parsedAuxiliary: Record | undefined; + + if (logLine.auxiliary) { + parsedAuxiliary = {}; + + for (const [key, entry] of Object.entries(logLine.auxiliary)) { + try { + parsedAuxiliary[key] = + entry.type === "object" ? JSON.parse(entry.value) : entry.value; + } catch (parseError) { + console.warn(`Failed to parse auxiliary entry ${key}:`, parseError); + // If parsing fails, use the raw value + parsedAuxiliary[key] = entry.value; + } + } + } + return { ...logLine, - // Remove the original auxiliary field in favor of parsedAuxiliary auxiliary: undefined, - parsedAuxiliary: logLine.auxiliary - ? Object.fromEntries( - Object.entries(logLine.auxiliary).map(([key, entry]) => [ - key, - entry.type === "object" ? JSON.parse(entry.value) : entry.value, - ]), - ) - : undefined, + parsedAuxiliary, } as LogLineEval; } catch (e) { console.log("Error parsing log line", logLine); @@ -56,10 +65,12 @@ function parseLogLine(logLine: LogLine): LogLineEval { * included in evaluation output. */ export class EvalLogger { - logs: LogLineEval[] = []; + private logs: LogLineEval[] = []; stagehand?: Stagehand; - constructor() {} + constructor() { + this.logs = []; + } /** * init: @@ -78,9 +89,7 @@ export class EvalLogger { */ log(logLine: LogLine) { console.log(logLineToString(logLine)); - if (this.logs) { - this.logs.push(parseLogLine(logLine)); - } + this.logs.push(parseLogLine(logLine)); } /** @@ -108,7 +117,7 @@ export class EvalLogger { * Retrieves the array of stored log lines. * Useful for returning logs after a task completes, for analysis or debugging. */ - getLogs() { - return this.logs; + getLogs(): LogLineEval[] { + return this.logs || []; } } diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index 03dff1a24..9b8a13177 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -12,9 +12,8 @@ import fs from "fs"; import path from "path"; -import { AvailableModel, AvailableModelSchema } from "@/dist"; +import { AvailableModel } from "@/dist"; import { filterByEvalName } from "./args"; -import { UnsupportedModelError } from "@/types/stagehandErrors"; // The configuration file `evals.config.json` contains a list of tasks and their associated categories. const configPath = path.join(__dirname, "evals.config.json"); @@ -50,7 +49,14 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) { */ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS ? process.env.EVAL_MODELS.split(",") - : ["gpt-4o"]; + : [ + "gemini-2.0-flash", + // "claude-3-5-sonnet-20240620", + // "gpt-4o-mini", + // "gpt-4o", + // "llama-3.3-70b-versatile", + // "llama3.3-70b", + ]; /** * getModelList: @@ -62,9 +68,6 @@ const getModelList = (): string[] => { return DEFAULT_EVAL_MODELS; }; const MODELS: AvailableModel[] = getModelList().map((model) => { - if (!AvailableModelSchema.safeParse(model).success) { - throw new UnsupportedModelError(getModelList(), "Running evals"); - } return model as AvailableModel; }); diff --git a/evals/tasks/allrecipes.ts b/evals/tasks/allrecipes.ts index 5f515b48d..9d2b193cf 100644 --- a/evals/tasks/allrecipes.ts +++ b/evals/tasks/allrecipes.ts @@ -16,7 +16,7 @@ export const allrecipes: EvalFunction = async ({ action: 'Type "chocolate chip cookies" in the search bar', }); await stagehand.page.act({ - action: "hit enter", + action: "click the search icon button", }); const recipeDetails = await stagehand.page.extract({ diff --git a/package-lock.json b/package-lock.json index f8590e583..f9404ff6e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@browserbasehq/stagehand", - "version": "1.14.0", + "version": "2.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@browserbasehq/stagehand", - "version": "1.14.0", + "version": "2.0.0", "license": "MIT", "dependencies": { "@anthropic-ai/sdk": "0.39.0", @@ -15,7 +15,7 @@ "zod-to-json-schema": "^3.23.5" }, "devDependencies": { - "@ai-sdk/google": "^1.0.13", + "@ai-sdk/google": "^1.2.6", "@ai-sdk/openai": "^1.0.14", "@changesets/changelog-github": "^0.5.0", "@changesets/cli": "^2.27.9", @@ -28,7 +28,7 @@ "@types/node": "^20.11.30", "@types/ws": "^8.5.13", "adm-zip": "^0.5.16", - "ai": "^4.0.26", + "ai": "^4.3.0", "autoevals": "^0.0.64", "braintrust": "^0.0.171", "chalk": "^5.4.1", @@ -57,14 +57,14 @@ } }, "node_modules/@ai-sdk/google": { - "version": "1.1.22", - "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-1.1.22.tgz", - "integrity": "sha512-z6k+Z/aJMazxQ5cxJ7nPOw8QsKtaTuNb6aX89g5rrv4FJUs/IT051LbmEw4+uhCLgVllSjQ5SidvN9qw1mdMtg==", + "version": "1.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-1.2.6.tgz", + "integrity": "sha512-e6vl+hmz7xZzWmsZZkLv89TZc19Vjgqj+RgvJNg03npRiuG4f1R/He1PD/JX6f0az//Y55CcozCcaj4vnMz6gQ==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@ai-sdk/provider": "1.0.10", - "@ai-sdk/provider-utils": "2.1.12" + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4" }, "engines": { "node": ">=18" @@ -73,6 +73,37 @@ "zod": "^3.0.0" } }, + "node_modules/@ai-sdk/google/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/google/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ai-sdk/openai": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.2.3.tgz", @@ -128,14 +159,14 @@ } }, "node_modules/@ai-sdk/react": { - "version": "1.1.22", - "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.1.22.tgz", - "integrity": "sha512-+ZlmJBu9NH59wvAtuh+xXEJ2MEfisBpVbYwN347+VlJJ3LfHVH9Rp1d58jVwBsfJvDUMn5UMuHZIXIdQhJIBTg==", + "version": "1.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.2.6.tgz", + "integrity": "sha512-5BFChNbcYtcY9MBStcDev7WZRHf0NpTrk8yfSoedWctB3jfWkFd1HECBvdc8w3mUQshF2MumLHtAhRO7IFtGGQ==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@ai-sdk/provider-utils": "2.1.12", - "@ai-sdk/ui-utils": "1.1.18", + "@ai-sdk/provider-utils": "2.2.4", + "@ai-sdk/ui-utils": "1.2.5", "swr": "^2.2.5", "throttleit": "2.1.0" }, @@ -144,17 +175,45 @@ }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", - "zod": "^3.0.0" + "zod": "^3.23.8" }, "peerDependenciesMeta": { - "react": { - "optional": true - }, "zod": { "optional": true } } }, + "node_modules/@ai-sdk/react/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/react/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ai-sdk/solid": { "version": "0.0.54", "resolved": "https://registry.npmjs.org/@ai-sdk/solid/-/solid-0.0.54.tgz", @@ -345,26 +404,52 @@ } }, "node_modules/@ai-sdk/ui-utils": { - "version": "1.1.18", - "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.1.18.tgz", - "integrity": "sha512-PCDXKKKHqA8Oqm5LsXl3Byxmit0r0Gg3nMPI3bdEriDIExoQikULX2T6/IS5u1qNeoC3UK5F2acpCyl5Q+aIuQ==", + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.5.tgz", + "integrity": "sha512-XDgqnJcaCkDez7qolvk+PDbs/ceJvgkNkxkOlc9uDWqxfDJxtvCZ+14MP/1qr4IBwGIgKVHzMDYDXvqVhSWLzg==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@ai-sdk/provider": "1.0.10", - "@ai-sdk/provider-utils": "2.1.12", + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4", "zod-to-json-schema": "^3.24.1" }, "engines": { "node": ">=18" }, "peerDependencies": { - "zod": "^3.0.0" + "zod": "^3.23.8" + } + }, + "node_modules/@ai-sdk/ui-utils/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" }, - "peerDependenciesMeta": { - "zod": { - "optional": true - } + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/ui-utils/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" } }, "node_modules/@ai-sdk/vue": { @@ -2847,18 +2932,17 @@ } }, "node_modules/ai": { - "version": "4.1.58", - "resolved": "https://registry.npmjs.org/ai/-/ai-4.1.58.tgz", - "integrity": "sha512-ugOL7Py5GwlVVhqOW2UAK2ouUgzMepK8xkJtW4EXvK2CllYvNdK/2SG2sppdo7YJYSPqtlwbvi8BsDYNDwFlgg==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ai/-/ai-4.3.0.tgz", + "integrity": "sha512-PxyQYKhWaU3LiZEpeKRaekVonZIbWdKAwgnqm0CSAxy1MFufmYEC5SM5Mc9uiK2DoHcbAL3d1jyaQ2fSDAJL8w==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@ai-sdk/provider": "1.0.10", - "@ai-sdk/provider-utils": "2.1.12", - "@ai-sdk/react": "1.1.22", - "@ai-sdk/ui-utils": "1.1.18", + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4", + "@ai-sdk/react": "1.2.6", + "@ai-sdk/ui-utils": "1.2.5", "@opentelemetry/api": "1.9.0", - "eventsource-parser": "^3.0.0", "jsondiffpatch": "0.6.0" }, "engines": { @@ -2866,17 +2950,45 @@ }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", - "zod": "^3.0.0" + "zod": "^3.23.8" }, "peerDependenciesMeta": { "react": { "optional": true - }, - "zod": { - "optional": true } } }, + "node_modules/ai/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/ai/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", diff --git a/package.json b/package.json index 5f596c0f7..a4b1c5c0d 100644 --- a/package.json +++ b/package.json @@ -44,7 +44,7 @@ "author": "Browserbase", "license": "MIT", "devDependencies": { - "@ai-sdk/google": "^1.0.13", + "@ai-sdk/google": "^1.2.6", "@ai-sdk/openai": "^1.0.14", "@changesets/changelog-github": "^0.5.0", "@changesets/cli": "^2.27.9", @@ -57,7 +57,7 @@ "@types/node": "^20.11.30", "@types/ws": "^8.5.13", "adm-zip": "^0.5.16", - "ai": "^4.0.26", + "ai": "^4.3.0", "autoevals": "^0.0.64", "braintrust": "^0.0.171", "chalk": "^5.4.1", @@ -80,9 +80,9 @@ "deepmerge": "^4.3.1", "dotenv": "^16.4.5", "openai": "^4.87.1", - "zod": "^3.23.8", "pino": "^9.6.0", - "pino-pretty": "^13.0.0" + "pino-pretty": "^13.0.0", + "zod": "^3.23.8" }, "dependencies": { "@anthropic-ai/sdk": "0.39.0", From be5449a73cafbc71075bb661d616155a951e27d6 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Fri, 4 Apr 2025 22:59:10 -0700 Subject: [PATCH 05/25] changeset --- .changeset/stupid-ghosts-smash.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/stupid-ghosts-smash.md diff --git a/.changeset/stupid-ghosts-smash.md b/.changeset/stupid-ghosts-smash.md new file mode 100644 index 000000000..ffb20366a --- /dev/null +++ b/.changeset/stupid-ghosts-smash.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Fix: forward along the stack trace in StagehandDefaultError From 70ed0672234e46ec164b1b81a99923700a715ddf Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Sat, 5 Apr 2025 20:48:36 -0700 Subject: [PATCH 06/25] fix evals config --- evals/evals.config.json | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/evals/evals.config.json b/evals/evals.config.json index ba7fb31e4..910c1ec79 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -6,7 +6,7 @@ }, { "name": "expect_act_timeout", - "categories": ["unit", "combination"] + "categories": ["unit"] }, { "name": "extract_repo_name", @@ -14,7 +14,7 @@ }, { "name": "amazon_add_to_cart", - "categories": ["act"] + "categories": ["experimental"] }, { "name": "instructions", @@ -199,14 +199,6 @@ "name": "panamcs", "categories": ["observe"] }, - { - "name": "shopify_homepage", - "categories": ["observe"] - }, - { - "name": "vanta", - "categories": ["observe"] - }, { "name": "vanta_h", "categories": ["observe"] From 1a3d739fc1ffa02e9018d9043c639c32ce717a77 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Sat, 5 Apr 2025 20:51:11 -0700 Subject: [PATCH 07/25] fix evals config --- evals/evals.config.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/evals/evals.config.json b/evals/evals.config.json index 910c1ec79..1936cb69d 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -249,11 +249,11 @@ }, { "name": "observe_iframes1", - "categories": ["unit", "combination"] + "categories": ["unit", "observe"] }, { "name": "observe_iframes2", - "categories": ["unit", "combination"] + "categories": ["unit", "observe"] }, { "name": "extract_hamilton_weather", @@ -285,19 +285,19 @@ }, { "name": "scroll_50", - "categories": ["unit", "combination"] + "categories": ["unit", "act"] }, { "name": "scroll_75", - "categories": ["unit", "combination"] + "categories": ["unit", "act"] }, { "name": "nextChunk", - "categories": ["unit", "combination"] + "categories": ["unit", "act"] }, { "name": "prevChunk", - "categories": ["unit", "combination"] + "categories": ["unit", "act"] } ] } From a1e632066b846b5355f37bb6d17c309bbf2ebc8e Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Thu, 10 Apr 2025 09:08:19 -0700 Subject: [PATCH 08/25] Update evals/taskConfig.ts Co-authored-by: Sean McGuire <75873287+seanmcguire12@users.noreply.github.com> --- evals/taskConfig.ts | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index 9b8a13177..f950d6b0e 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -50,12 +50,9 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) { const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS ? process.env.EVAL_MODELS.split(",") : [ - "gemini-2.0-flash", - // "claude-3-5-sonnet-20240620", - // "gpt-4o-mini", - // "gpt-4o", - // "llama-3.3-70b-versatile", - // "llama3.3-70b", + "claude-3-5-sonnet-latest", + "gpt-4o-mini", + "gpt-4o", ]; /** From 367d9d89a3591fb06bb9502faecc7ff785abbefb Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Thu, 10 Apr 2025 09:11:17 -0700 Subject: [PATCH 09/25] rebase --- evals/taskConfig.ts | 82 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index f950d6b0e..ac54ae506 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -15,9 +15,49 @@ import path from "path"; import { AvailableModel } from "@/dist"; import { filterByEvalName } from "./args"; +const ALL_EVAL_MODELS = [ + // GOOGLE + "gemini-2.0-flash", + "gemini-2.0-flash-lite", + "gemini-1.5-flash", + "gemini-2.5-pro-exp-03-25", + "gemini-1.5-pro", + "gemini-1.5-flash-8b", + // ANTHROPIC + "claude-3-5-sonnet-latest", + "claude-3-7-sonnet-latest", + // OPENAI + "gpt-4o-mini", + "gpt-4o", + "gpt-4.5-preview", + // TOGETHER - META + "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", + "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + // TOGETHER - DEEPSEEK + "deepseek-ai/DeepSeek-V3", + "Qwen/Qwen2.5-7B-Instruct-Turbo", + // GROQ + "groq/meta-llama/llama-4-scout-17b-16e-instruct", + "groq/llama-3.3-70b-versatile", + "groq/llama3-70b-8192", + "groq/qwen-qwq-32b", + "groq/qwen-2.5-32b", + "groq/deepseek-r1-distill-qwen-32b", + "groq/deepseek-r1-distill-llama-70b", + // CEREBRAS + "cerebras/llama3.3-70b", +]; + // The configuration file `evals.config.json` contains a list of tasks and their associated categories. const configPath = path.join(__dirname, "evals.config.json"); -const config = JSON.parse(fs.readFileSync(configPath, "utf-8")); +const config = JSON.parse(fs.readFileSync(configPath, "utf-8")) satisfies { + tasks: { + name: string; + categories: string[]; + }[]; +}; /** * The `tasksConfig` defines all tasks from the config file. Each task has a name and categories. @@ -45,27 +85,49 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) { * Determine which models to run the evaluations against. * * DEFAULT_EVAL_MODELS: The default set of models used for most categories. - * EXPERIMENTAL_EVAL_MODELS: Additional models included if the category is "experimental". */ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS ? process.env.EVAL_MODELS.split(",") - : [ - "claude-3-5-sonnet-latest", - "gpt-4o-mini", - "gpt-4o", - ]; + : ["claude-3-5-sonnet-latest", "gpt-4o-mini", "gpt-4o"]; /** * getModelList: * Returns a list of models to be used for the given category. * If category is "experimental", it merges DEFAULT_EVAL_MODELS and EXPERIMENTAL_EVAL_MODELS. - * Otherwise, returns DEFAULT_EVAL_MODELS. + * Otherwise, returns DEFAULT_EVAL_MODELS filtered by provider if specified. */ const getModelList = (): string[] => { - return DEFAULT_EVAL_MODELS; + const provider = process.env.EVAL_PROVIDER?.toLowerCase(); + + if (!provider) { + return DEFAULT_EVAL_MODELS; + } + + return ALL_EVAL_MODELS.filter((model) => { + const modelLower = model.toLowerCase(); + if (provider === "openai") { + return modelLower.startsWith("gpt"); + } else if (provider === "anthropic") { + return modelLower.startsWith("claude"); + } else if (provider === "google") { + return modelLower.startsWith("gemini"); + } else if (provider === "together") { + return ( + modelLower.startsWith("meta-llama") || + modelLower.startsWith("llama") || + modelLower.startsWith("deepseek") || + modelLower.startsWith("qwen") + ); + } else if (provider === "groq") { + return modelLower.startsWith("groq"); + } else if (provider === "cerebras") { + return modelLower.startsWith("cerebras"); + } + return true; + }); }; const MODELS: AvailableModel[] = getModelList().map((model) => { return model as AvailableModel; }); -export { tasksByName, MODELS, config }; +export { tasksByName, MODELS, tasksConfig }; From 7da5f54b38156ad81b9591af190635e2aea0b4f9 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Mon, 7 Apr 2025 16:50:48 -0700 Subject: [PATCH 10/25] temp --- evals/args.ts | 24 ++- evals/index.eval.ts | 77 ++++++- evals/tasks/amazon_add_to_cart.ts | 19 +- evals/tasks/extract_collaborators.ts | 24 ++- evals/tasks/observe_amazon_add_to_cart.ts | 19 +- evals/tasks/observe_yc_startup.ts | 2 +- evals/tasks/panamcs.ts | 6 +- evals/tasks/shopify_homepage.ts | 62 ------ evals/tasks/vanta.ts | 63 ------ evals/tasks/vanta_h.ts | 7 +- package-lock.json | 246 ++++++++++++++++++++++ package.json | 4 + 12 files changed, 372 insertions(+), 181 deletions(-) delete mode 100644 evals/tasks/shopify_homepage.ts delete mode 100644 evals/tasks/vanta.ts diff --git a/evals/args.ts b/evals/args.ts index ea06b6d47..2b990d269 100644 --- a/evals/args.ts +++ b/evals/args.ts @@ -8,6 +8,7 @@ const parsedArgs: { trials?: number; concurrency?: number; extractMethod?: string; + provider?: string; leftover: string[]; } = { leftover: [], @@ -28,6 +29,8 @@ for (const arg of rawArgs) { } } else if (arg.startsWith("--extract-method=")) { parsedArgs.extractMethod = arg.split("=")[1]; + } else if (arg.startsWith("provider=")) { + parsedArgs.provider = arg.split("=")[1]?.toLowerCase(); } else { parsedArgs.leftover.push(arg); } @@ -60,14 +63,14 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES "act", "combination", "extract", - "experimental", - "text_extract", - "targeted_extract", - "regression_llm_providers", - "regression_text_extract", - "regression_dom_extract", - "llm_clients", - "unit", + // "experimental", + // "text_extract", + // "targeted_extract", + // "regression_llm_providers", + // "regression_text_extract", + // "regression_dom_extract", + // "llm_clients", + // "unit", ]; // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name @@ -95,10 +98,15 @@ if (parsedArgs.leftover.length > 0) { } } +if (parsedArgs.provider !== undefined) { + process.env.EVAL_PROVIDER = parsedArgs.provider; +} + export { filterByCategory, filterByEvalName, useTextExtract, useAccessibilityTree, DEFAULT_EVAL_CATEGORIES, + parsedArgs, }; diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 79538f14b..fb29d7536 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -15,10 +15,15 @@ import fs from "fs"; import path from "path"; import process from "process"; -import { filterByCategory, filterByEvalName, useTextExtract } from "./args"; +import { + DEFAULT_EVAL_CATEGORIES, + filterByCategory, + filterByEvalName, + useTextExtract, +} from "./args"; import { generateExperimentName } from "./utils"; import { exactMatch, errorMatch } from "./scoring"; -import { tasksByName, MODELS } from "./taskConfig"; +import { tasksByName, MODELS, tasksConfig } from "./taskConfig"; import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust"; import { EvalFunction, SummaryResult, Testcase } from "@/types/evals"; import { EvalLogger } from "./logger"; @@ -31,7 +36,9 @@ import OpenAI from "openai"; import { initStagehand } from "./initStagehand"; import { AISdkClient } from "@/examples/external_clients/aisdk"; import { google } from "@ai-sdk/google"; - +import { anthropic } from "@ai-sdk/anthropic"; +import { groq } from "@ai-sdk/groq"; +import { cerebras } from "@ai-sdk/cerebras"; dotenv.config(); /** @@ -40,11 +47,11 @@ dotenv.config(); */ const MAX_CONCURRENCY = process.env.EVAL_MAX_CONCURRENCY ? parseInt(process.env.EVAL_MAX_CONCURRENCY, 10) - : 20; + : 3; const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT ? parseInt(process.env.EVAL_TRIAL_COUNT, 10) - : 5; + : 3; /** * generateSummary: @@ -146,15 +153,29 @@ const generateFilteredTestcases = (): Testcase[] => { Object.keys(tasksByName).map((testName) => ({ input: { name: testName, modelName: model }, name: testName, - tags: [model, testName], + tags: [ + model, + testName, + ...(tasksConfig.find((t) => t.name === testName)?.categories || []).map( + (x) => `category/${x}`, + ), + ], metadata: { model, test: testName, + categories: tasksConfig.find((t) => t.name === testName)?.categories, }, expected: true, })), ); + // Filter test cases to match default eval categories + allTestcases = allTestcases.filter((testcase) => + DEFAULT_EVAL_CATEGORIES.some((category) => + tasksByName[testcase.name].categories.includes(category), + ), + ); + // Filter by category if a category is specified if (filterByCategory) { allTestcases = allTestcases.filter((testcase) => @@ -178,6 +199,16 @@ const generateFilteredTestcases = (): Testcase[] => { ); } + console.log( + "All test cases:", + allTestcases + .map( + (t, i) => + `${i}: ${t.name} (${t.input.modelName}): ${t.metadata.categories}`, + ) + .join("\n"), + ); + return allTestcases; }; @@ -244,7 +275,37 @@ const generateFilteredTestcases = (): Testcase[] => { }); if (input.modelName.startsWith("gemini")) { llmClient = new AISdkClient({ - model: wrapAISDKModel(google("gemini-2.0-flash")), + model: wrapAISDKModel(google(input.modelName)), + }); + } else if (input.modelName.startsWith("claude")) { + llmClient = new AISdkClient({ + model: wrapAISDKModel(anthropic(input.modelName)), + }); + } else if (input.modelName.includes("groq")) { + llmClient = new AISdkClient({ + model: wrapAISDKModel( + groq( + input.modelName.substring(input.modelName.indexOf("/") + 1), + ), + ), + }); + } else if (input.modelName.includes("cerebras")) { + llmClient = new AISdkClient({ + model: wrapAISDKModel( + cerebras( + input.modelName.substring(input.modelName.indexOf("/") + 1), + ), + ), + }); + } else if (input.modelName.includes("/")) { + llmClient = new CustomOpenAIClient({ + modelName: input.modelName as AvailableModel, + client: wrapOpenAI( + new OpenAI({ + apiKey: process.env.TOGETHER_AI_API_KEY, + baseURL: "https://api.together.xyz/v1", + }), + ), }); } const taskInput = await initStagehand({ @@ -295,6 +356,8 @@ const generateFilteredTestcases = (): Testcase[] => { trialCount: TRIAL_COUNT, }); + if (!process.env.WRITE_FILE) return; + // Map results to the SummaryResult format const summaryResults: SummaryResult[] = evalResult.results.map((result) => { const output = diff --git a/evals/tasks/amazon_add_to_cart.ts b/evals/tasks/amazon_add_to_cart.ts index 6272ae054..d6a9bb2f1 100644 --- a/evals/tasks/amazon_add_to_cart.ts +++ b/evals/tasks/amazon_add_to_cart.ts @@ -6,23 +6,16 @@ export const amazon_add_to_cart: EvalFunction = async ({ sessionUrl, stagehand, }) => { - await stagehand.page.goto( - "https://www.amazon.com/Laptop-MacBook-Surface-Water-Resistant-Accessories/dp/B0D5M4H5CD", - ); - - await stagehand.page.waitForTimeout(5000); - - await stagehand.page.act({ - action: "click the 'Add to Cart' button", - }); - - await stagehand.page.waitForTimeout(2000); + await stagehand.page.goto("https://www.amazon.com"); + await stagehand.page.act("click on the search bar"); + await stagehand.page.act("type 'amazon basics classic lined notebook'"); + await stagehand.page.act("click on the search button"); + await stagehand.page.act("click on the first relevant search result"); + await stagehand.page.act("click the 'Add to Cart' button"); await stagehand.page.act({ action: "click the 'Proceed to checkout' button", }); - - await stagehand.page.waitForTimeout(2000); const currentUrl = stagehand.page.url(); const expectedUrlPrefix = "https://www.amazon.com/ap/signin"; diff --git a/evals/tasks/extract_collaborators.ts b/evals/tasks/extract_collaborators.ts index b0da6f37a..8e10ee020 100644 --- a/evals/tasks/extract_collaborators.ts +++ b/evals/tasks/extract_collaborators.ts @@ -11,18 +11,22 @@ export const extract_collaborators: EvalFunction = async ({ try { await stagehand.page.goto("https://github.com/facebook/react"); await stagehand.page.act({ - action: "find the contributors section", + action: "find and click the contributors section", }); + await stagehand.page.waitForLoadState("domcontentloaded"); + await stagehand.page.waitForLoadState("networkidle"); + await stagehand.page.waitForTimeout(5000); + const { contributors } = await stagehand.page.extract({ - instruction: "Extract top 20 contributors of this repository", + instruction: "Extract top 5 contributors of this repository", schema: z.object({ contributors: z.array( z.object({ github_username: z .string() .describe("the github username of the contributor"), - information: z.string().describe("number of commits contributed"), + commits: z.number().describe("number of commits contributed"), }), ), }), @@ -31,8 +35,20 @@ export const extract_collaborators: EvalFunction = async ({ await stagehand.close(); + const EXPECTED_CONTRIBUTORS = [ + "zpao", + "gaearon", + "sebmarkbage", + "acdlite", + "sophiebits", + ]; return { - _success: contributors.length === 20, + _success: + contributors.length === EXPECTED_CONTRIBUTORS.length && + contributors.every( + (c, i) => + EXPECTED_CONTRIBUTORS[i] === c.github_username && c.commits >= 1000, + ), contributors, debugUrl, sessionUrl, diff --git a/evals/tasks/observe_amazon_add_to_cart.ts b/evals/tasks/observe_amazon_add_to_cart.ts index 39607dbf4..63bac0dbe 100644 --- a/evals/tasks/observe_amazon_add_to_cart.ts +++ b/evals/tasks/observe_amazon_add_to_cart.ts @@ -1,5 +1,4 @@ import { EvalFunction } from "@/types/evals"; -import { performPlaywrightMethod } from "@/lib/a11y/utils"; export const observe_amazon_add_to_cart: EvalFunction = async ({ debugUrl, @@ -24,33 +23,19 @@ export const observe_amazon_add_to_cart: EvalFunction = async ({ // Example of using performPlaywrightMethod if you have the xpath if (observations1.length > 0) { const action1 = observations1[0]; - await performPlaywrightMethod( - stagehand.page, - stagehand.logger, - action1.method, - action1.arguments, - action1.selector.replace("xpath=", ""), - ); + await stagehand.page.act(action1); } await stagehand.page.waitForTimeout(2000); const observations2 = await stagehand.page.observe({ instruction: "Find and click the 'Proceed to checkout' button", - onlyVisible: false, - returnAction: true, }); // Example of using performPlaywrightMethod if you have the xpath if (observations2.length > 0) { const action2 = observations2[0]; - await performPlaywrightMethod( - stagehand.page, - stagehand.logger, - action2.method, - action2.arguments, - action2.selector.replace("xpath=", ""), - ); + await stagehand.page.act(action2); } await stagehand.page.waitForTimeout(2000); diff --git a/evals/tasks/observe_yc_startup.ts b/evals/tasks/observe_yc_startup.ts index 9470e6323..bdc654da5 100644 --- a/evals/tasks/observe_yc_startup.ts +++ b/evals/tasks/observe_yc_startup.ts @@ -11,7 +11,7 @@ export const observe_yc_startup: EvalFunction = async ({ const observations = await stagehand.page.observe({ instruction: - "Find the container element that holds links to each of the startup companies. The companies each have a name, a description, and a link to their website.", + "Click the container element that holds links to each of the startup companies. The companies each have a name, a description, and a link to their website.", }); if (observations.length === 0) { diff --git a/evals/tasks/panamcs.ts b/evals/tasks/panamcs.ts index 5447daa98..2b68d096e 100644 --- a/evals/tasks/panamcs.ts +++ b/evals/tasks/panamcs.ts @@ -10,7 +10,9 @@ export const panamcs: EvalFunction = async ({ "https://browserbase.github.io/stagehand-eval-sites/sites/panamcs/", ); - const observations = await stagehand.page.observe({ onlyVisible: true }); + const observations = await stagehand.page.observe( + "click the 'about us' link", + ); if (observations.length === 0) { await stagehand.close(); @@ -23,7 +25,7 @@ export const panamcs: EvalFunction = async ({ }; } - const expectedLocator = `a.btn:nth-child(3)`; + const expectedLocator = `#menu > li:nth-child(1) > a`; const expectedResult = await stagehand.page .locator(expectedLocator) diff --git a/evals/tasks/shopify_homepage.ts b/evals/tasks/shopify_homepage.ts deleted file mode 100644 index 03bd7f1fe..000000000 --- a/evals/tasks/shopify_homepage.ts +++ /dev/null @@ -1,62 +0,0 @@ -import { EvalFunction } from "@/types/evals"; - -export const shopify_homepage: EvalFunction = async ({ - debugUrl, - sessionUrl, - stagehand, - logger, -}) => { - await stagehand.page.goto("https://www.shopify.com/"); - - const observations = await stagehand.page.observe({ onlyVisible: true }); - - if (observations.length === 0) { - await stagehand.close(); - return { - _success: false, - observations, - debugUrl, - sessionUrl, - logs: logger.getLogs(), - }; - } - - const expectedLocator = `body > div.relative > header > div > div > div.ml-auto > ul.lg\\:flex.hidden.items-center > li.leading-\\[0\\] > a`; - - const expectedResult = await stagehand.page - .locator(expectedLocator) - .first() - .innerText(); - - let foundMatch = false; - for (const observation of observations) { - try { - const observationResult = await stagehand.page - .locator(observation.selector) - .first() - .innerText(); - - if (observationResult === expectedResult) { - foundMatch = true; - break; - } - } catch (error) { - console.warn( - `Failed to check observation with selector ${observation.selector}:`, - error.message, - ); - continue; - } - } - - await stagehand.close(); - - return { - _success: foundMatch, - expected: expectedResult, - observations, - debugUrl, - sessionUrl, - logs: logger.getLogs(), - }; -}; diff --git a/evals/tasks/vanta.ts b/evals/tasks/vanta.ts deleted file mode 100644 index 016127392..000000000 --- a/evals/tasks/vanta.ts +++ /dev/null @@ -1,63 +0,0 @@ -import { EvalFunction } from "@/types/evals"; - -export const vanta: EvalFunction = async ({ - debugUrl, - sessionUrl, - stagehand, - logger, -}) => { - await stagehand.page.goto("https://www.vanta.com/"); - await stagehand.page.act({ action: "close the cookies popup" }); - - const observations = await stagehand.page.observe({ onlyVisible: true }); - - if (observations.length === 0) { - await stagehand.close(); - return { - _success: false, - observations, - debugUrl, - sessionUrl, - logs: logger.getLogs(), - }; - } - - const expectedLocator = `body > div.page-wrapper > div.nav_component > div.nav_element.w-nav > div.padding-global > div > div > nav > div.nav_cta-wrapper.is-new > a.nav_link.is-tablet-margin-0.w-nav-link`; - - const expectedResult = await stagehand.page - .locator(expectedLocator) - .first() - .innerHTML(); - - let foundMatch = false; - for (const observation of observations) { - try { - const observationResult = await stagehand.page - .locator(observation.selector) - .first() - .innerHTML(); - - if (observationResult === expectedResult) { - foundMatch = true; - break; - } - } catch (error) { - console.warn( - `Failed to check observation with selector ${observation.selector}:`, - error.message, - ); - continue; - } - } - - await stagehand.close(); - - return { - _success: foundMatch, - expected: expectedResult, - observations, - debugUrl, - sessionUrl, - logs: logger.getLogs(), - }; -}; diff --git a/evals/tasks/vanta_h.ts b/evals/tasks/vanta_h.ts index 1eb139002..dba21d303 100644 --- a/evals/tasks/vanta_h.ts +++ b/evals/tasks/vanta_h.ts @@ -8,10 +8,9 @@ export const vanta_h: EvalFunction = async ({ }) => { await stagehand.page.goto("https://www.vanta.com/"); - const observations = await stagehand.page.observe({ - instruction: "find the buy now button if it is available", - onlyVisible: true, - }); + const observations = await stagehand.page.observe( + "click the buy now button if it is available", + ); await stagehand.close(); diff --git a/package-lock.json b/package-lock.json index f9404ff6e..bcb38bd37 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,8 +15,12 @@ "zod-to-json-schema": "^3.23.5" }, "devDependencies": { + "@ai-sdk/anthropic": "^1.2.6", + "@ai-sdk/cerebras": "^0.2.6", "@ai-sdk/google": "^1.2.6", + "@ai-sdk/groq": "^1.2.4", "@ai-sdk/openai": "^1.0.14", + "@ai-sdk/togetherai": "^0.2.6", "@changesets/changelog-github": "^0.5.0", "@changesets/cli": "^2.27.9", "@eslint/js": "^9.16.0", @@ -56,6 +60,103 @@ "zod": "^3.23.8" } }, + "node_modules/@ai-sdk/anthropic": { + "version": "1.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-1.2.6.tgz", + "integrity": "sha512-Mt8ZSkhwnKHfwPPIviv3xgRE/nch2Mu4Fdh7oJDJvPDRJ6tNidCJd3TMwdlrlzPskF7hxCmXmd36yBgZZgt4cA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, + "node_modules/@ai-sdk/cerebras": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/cerebras/-/cerebras-0.2.6.tgz", + "integrity": "sha512-XpVqq5462HyQPT55Ptpdb0pwti3fbLOZGDeWgxkSwJTzC+fCzDLZFLJKDCINhiMzD+8CAQPQ/qm9+inFZaF0Og==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/openai-compatible": "0.2.6", + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/cerebras/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/cerebras/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ai-sdk/google": { "version": "1.2.6", "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-1.2.6.tgz", @@ -104,6 +205,54 @@ "zod": "^3.23.8" } }, + "node_modules/@ai-sdk/groq": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/groq/-/groq-1.2.4.tgz", + "integrity": "sha512-jeO/tO8lGpk7L/zpPPSeZ6tMYwcXq2LbPEgmvTDhdrSj4AwYhL9WiEmZerixsKzNuOxjAzP0QZOns1SpsGaC2A==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/groq/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/groq/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ai-sdk/openai": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.2.3.tgz", @@ -121,6 +270,54 @@ "zod": "^3.0.0" } }, + "node_modules/@ai-sdk/openai-compatible": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/openai-compatible/-/openai-compatible-0.2.6.tgz", + "integrity": "sha512-UOPEWIqG3l5K9O+p7gqiCOWzx66JtmG9v9Mab+S4E7WE34EN6u1QS1pX+RDlRDhZ0/8gNJif0r4Xlc+Ti03yNA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/openai-compatible/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/openai-compatible/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ai-sdk/provider": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.0.10.tgz", @@ -403,6 +600,55 @@ "node": ">=14.18" } }, + "node_modules/@ai-sdk/togetherai": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/togetherai/-/togetherai-0.2.6.tgz", + "integrity": "sha512-AV3CABMKlIniCe5owr6H/kSirfk3Y/MeBAetrNJxRDhmrxa5VXzbWeMxS5xeS8crqFXWJPPLcwPiwOuFtpQMrA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/openai-compatible": "0.2.6", + "@ai-sdk/provider": "1.1.0", + "@ai-sdk/provider-utils": "2.2.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/togetherai/node_modules/@ai-sdk/provider": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.0.tgz", + "integrity": "sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/togetherai/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.4.tgz", + "integrity": "sha512-13sEGBxB6kgaMPGOgCLYibF6r8iv8mgjhuToFrOTU09bBxbFQd8ZoARarCfJN6VomCUbUvMKwjTBLb1vQnN+WA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.0", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ai-sdk/ui-utils": { "version": "1.2.5", "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.5.tgz", diff --git a/package.json b/package.json index a4b1c5c0d..6febf054d 100644 --- a/package.json +++ b/package.json @@ -44,8 +44,12 @@ "author": "Browserbase", "license": "MIT", "devDependencies": { + "@ai-sdk/anthropic": "^1.2.6", + "@ai-sdk/cerebras": "^0.2.6", "@ai-sdk/google": "^1.2.6", + "@ai-sdk/groq": "^1.2.4", "@ai-sdk/openai": "^1.0.14", + "@ai-sdk/togetherai": "^0.2.6", "@changesets/changelog-github": "^0.5.0", "@changesets/cli": "^2.27.9", "@eslint/js": "^9.16.0", From 9a23a757ecc98d8d0d99319a64f657c7cac6aef5 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Thu, 10 Apr 2025 09:12:44 -0700 Subject: [PATCH 11/25] all eval tasks --- evals/args.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/evals/args.ts b/evals/args.ts index 2b990d269..fe3d762e2 100644 --- a/evals/args.ts +++ b/evals/args.ts @@ -63,14 +63,14 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES "act", "combination", "extract", - // "experimental", - // "text_extract", - // "targeted_extract", - // "regression_llm_providers", - // "regression_text_extract", - // "regression_dom_extract", - // "llm_clients", - // "unit", + "experimental", + "text_extract", + "targeted_extract", + "regression_llm_providers", + "regression_text_extract", + "regression_dom_extract", + "llm_clients", + "unit", ]; // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name From 78a21b1835318e17f4c2fd9f6a130ebdee8d41c6 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Thu, 10 Apr 2025 09:29:44 -0700 Subject: [PATCH 12/25] address comments and remove hn from ci --- .github/workflows/ci.yml | 61 ++-------------------------------------- evals/initStagehand.ts | 10 +------ types/evals.ts | 13 ++++++++- 3 files changed, 15 insertions(+), 69 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c2a6c56c..480ccf16f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -186,57 +186,6 @@ jobs: - name: Run E2E Tests (browserbase) run: npm run e2e:bb - run-regression-evals-llm-providers: - needs: - [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] - runs-on: ubuntu-latest - timeout-minutes: 9 - outputs: - regression_llm_providers_score: ${{ steps.set-llm-providers-score.outputs.regression_llm_providers_score }} - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} - BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} - BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} - HEADLESS: true - EVAL_ENV: browserbase - EVAL_MODELS: "gpt-4o-mini" - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: "20" - - - name: Install dependencies - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Build Stagehand - run: npm run build - - - name: Install Playwright browsers - run: npm exec playwright install --with-deps - - - name: Run Regression Evals (llmProviders) - run: npm run evals category regression_llm_providers trials=2 concurrency=8 env=BROWSERBASE - - - name: Save Regression llmProviders Results - run: mv eval-summary.json eval-summary-regression-llm-providers.json - - - name: Log and Regression (llmProviders) Evals Performance - id: set-llm-providers-score - run: | - experimentNameRegressionLlmProviders=$(jq -r '.experimentName' eval-summary-regression-llm-providers.json) - regression_llm_providers_score=$(jq '.categories.regression_llm_providers' eval-summary-regression-llm-providers.json) - echo "regression_llm_providers category score: ${regression_llm_providers_score}%" - echo "View regression_llm_providers results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionLlmProviders}" - echo "regression_llm_providers_score=$regression_llm_providers_score" >> "$GITHUB_OUTPUT" - run-regression-evals-dom-extract: needs: [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] @@ -340,12 +289,7 @@ jobs: echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT" check-regression-evals-score: - needs: - [ - run-regression-evals-text-extract, - run-regression-evals-dom-extract, - run-regression-evals-llm-providers, - ] + needs: [run-regression-evals-text-extract, run-regression-evals-dom-extract] runs-on: ubuntu-latest timeout-minutes: 5 steps: @@ -353,9 +297,8 @@ jobs: run: | regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}" regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}" - regression_llm_providers_score="${{ needs.run-regression-evals-llm-providers.outputs.regression_llm_providers_score }}" - overall_score=$(echo "(${regression_dom_score} + ${regression_text_score} + ${regression_llm_providers_score}) / 3" | bc -l) + overall_score=$(echo "(${regression_dom_score} + ${regression_text_score}) / 2" | bc -l) echo "Overall regression score: ${overall_score}%" # Fail if overall score is below 90% diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts index ab9c16856..174778aea 100644 --- a/evals/initStagehand.ts +++ b/evals/initStagehand.ts @@ -13,15 +13,7 @@ import { enableCaching, env } from "./env"; import { ConstructorParams, LLMClient, Stagehand } from "@/dist"; import { EvalLogger } from "./logger"; - -export type StagehandInitResult = { - stagehand: Stagehand; - logger: EvalLogger; - debugUrl: string; - sessionUrl: string; - useTextExtract: boolean; - stagehandConfig: ConstructorParams; -}; +import type { StagehandInitResult } from "@/types/evals"; /** * StagehandConfig: diff --git a/types/evals.ts b/types/evals.ts index efb8c0da5..ef4aba8f2 100644 --- a/types/evals.ts +++ b/types/evals.ts @@ -2,7 +2,18 @@ import { z } from "zod"; import type { AvailableModel } from "../types/model"; import type { LogLine } from "../types/log"; import type { EvalCase } from "braintrust"; -import { StagehandInitResult } from "@/evals/initStagehand"; +import { Stagehand } from "@/dist"; +import { ConstructorParams } from "@/dist"; +import { EvalLogger } from "@/evals/logger"; + +export type StagehandInitResult = { + stagehand: Stagehand; + logger: EvalLogger; + debugUrl: string; + sessionUrl: string; + useTextExtract: boolean; + stagehandConfig: ConstructorParams; +}; export type EvalFunction = (taskInput: StagehandInitResult) => Promise<{ _success: boolean; From e2c0df33ef7a7f95122436818327a1efb3a36df8 Mon Sep 17 00:00:00 2001 From: Anirudh Kamath Date: Thu, 10 Apr 2025 11:14:48 -0700 Subject: [PATCH 13/25] press enter --- evals/tasks/allrecipes.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/tasks/allrecipes.ts b/evals/tasks/allrecipes.ts index 9d2b193cf..e1b8a0c50 100644 --- a/evals/tasks/allrecipes.ts +++ b/evals/tasks/allrecipes.ts @@ -16,7 +16,7 @@ export const allrecipes: EvalFunction = async ({ action: 'Type "chocolate chip cookies" in the search bar', }); await stagehand.page.act({ - action: "click the search icon button", + action: "press enter", }); const recipeDetails = await stagehand.page.extract({ From 879d3e95bbfc0a857b9c6794cab681c325acf1c2 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 12:36:08 -0700 Subject: [PATCH 14/25] dont use braintrust ai proxy --- evals/index.eval.ts | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/evals/index.eval.ts b/evals/index.eval.ts index fb29d7536..995c9e030 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -264,16 +264,17 @@ const generateFilteredTestcases = (): Testcase[] => { } // Execute the task - let llmClient: LLMClient = new CustomOpenAIClient({ - modelName: input.modelName as AvailableModel, - client: wrapOpenAI( - new OpenAI({ - apiKey: process.env.BRAINTRUST_API_KEY, - baseURL: "https://api.braintrust.dev/v1/proxy", - }), - ), - }); - if (input.modelName.startsWith("gemini")) { + let llmClient: LLMClient; + if (input.modelName.startsWith("gpt")) { + llmClient = new CustomOpenAIClient({ + modelName: input.modelName as AvailableModel, + client: wrapOpenAI( + new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }), + ), + }); + } else if (input.modelName.startsWith("gemini")) { llmClient = new AISdkClient({ model: wrapAISDKModel(google(input.modelName)), }); From 7e71bf01d30afff8f5ded4fccf123eb82e380d1c Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 12:36:37 -0700 Subject: [PATCH 15/25] fix amazon eval --- evals/tasks/amazon_add_to_cart.ts | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/evals/tasks/amazon_add_to_cart.ts b/evals/tasks/amazon_add_to_cart.ts index d6a9bb2f1..64c410f8f 100644 --- a/evals/tasks/amazon_add_to_cart.ts +++ b/evals/tasks/amazon_add_to_cart.ts @@ -6,21 +6,31 @@ export const amazon_add_to_cart: EvalFunction = async ({ sessionUrl, stagehand, }) => { - await stagehand.page.goto("https://www.amazon.com"); + await stagehand.page.goto( + "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/", + ); + + await stagehand.page.waitForTimeout(5000); + + await stagehand.page.act({ + action: "click the 'Add to Cart' button", + }); + + await stagehand.page.waitForTimeout(2000); - await stagehand.page.act("click on the search bar"); - await stagehand.page.act("type 'amazon basics classic lined notebook'"); - await stagehand.page.act("click on the search button"); - await stagehand.page.act("click on the first relevant search result"); - await stagehand.page.act("click the 'Add to Cart' button"); await stagehand.page.act({ action: "click the 'Proceed to checkout' button", }); + + await stagehand.page.waitForTimeout(2000); const currentUrl = stagehand.page.url(); - const expectedUrlPrefix = "https://www.amazon.com/ap/signin"; + const expectedUrl = + "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/sign-in.html"; + + await stagehand.close(); return { - _success: currentUrl.startsWith(expectedUrlPrefix), + _success: currentUrl === expectedUrl, currentUrl, debugUrl, sessionUrl, From 0b09e13a98c5782f6b75a56c20696040ecbe6d94 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 12:54:59 -0700 Subject: [PATCH 16/25] remove WRITE_FILE check --- evals/index.eval.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 995c9e030..8c00f9691 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -357,8 +357,6 @@ const generateFilteredTestcases = (): Testcase[] => { trialCount: TRIAL_COUNT, }); - if (!process.env.WRITE_FILE) return; - // Map results to the SummaryResult format const summaryResults: SummaryResult[] = evalResult.results.map((result) => { const output = From e67606e300f620c0840dbfe99500979d807fb839 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 13:30:30 -0700 Subject: [PATCH 17/25] revert amazon to act category --- evals/evals.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/evals.config.json b/evals/evals.config.json index 1936cb69d..c15222a0d 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -14,7 +14,7 @@ }, { "name": "amazon_add_to_cart", - "categories": ["experimental"] + "categories": ["act"] }, { "name": "instructions", From 5cab2d5d2674500c97638c6e99bcc610a76353f7 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 14:09:58 -0700 Subject: [PATCH 18/25] unify regression evals --- evals/args.ts | 4 +--- evals/evals.config.json | 39 +++++++++++++++++++++++---------------- evals/index.eval.ts | 11 ++++++++++- evals/taskConfig.ts | 13 ++++++++++--- types/evals.ts | 4 +--- 5 files changed, 45 insertions(+), 26 deletions(-) diff --git a/evals/args.ts b/evals/args.ts index fe3d762e2..4c2f748fc 100644 --- a/evals/args.ts +++ b/evals/args.ts @@ -67,10 +67,8 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES "text_extract", "targeted_extract", "regression_llm_providers", - "regression_text_extract", - "regression_dom_extract", + "regression", "llm_clients", - "unit", ]; // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name diff --git a/evals/evals.config.json b/evals/evals.config.json index c15222a0d..9630a86d7 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -6,7 +6,7 @@ }, { "name": "expect_act_timeout", - "categories": ["unit"] + "categories": ["regression"] }, { "name": "extract_repo_name", @@ -18,7 +18,7 @@ }, { "name": "instructions", - "categories": ["unit", "combination"] + "categories": ["regression", "combination"] }, { "name": "bidnet", @@ -26,7 +26,8 @@ }, { "name": "ionwave", - "categories": ["act", "regression_dom_extract"] + "categories": ["act", "regression"], + "extract_method": "domExtract" }, { "name": "nonsense_action", @@ -71,7 +72,7 @@ }, { "name": "peeler_complex", - "categories": ["unit", "combination"] + "categories": ["combination"] }, { "name": "sciquest", @@ -79,7 +80,8 @@ }, { "name": "wichita", - "categories": ["combination", "regression_dom_extract"] + "categories": ["combination", "regression"], + "extract_method": "domExtract" }, { "name": "hn_aisdk", @@ -115,7 +117,8 @@ }, { "name": "extract_aigrant_companies", - "categories": ["experimental", "text_extract", "regression_text_extract"] + "categories": ["text_extract", "regression"], + "extract_method": "textExtract" }, { "name": "extract_capacitor_info", @@ -164,7 +167,8 @@ }, { "name": "extract_memorial_healthcare", - "categories": ["extract", "regression_dom_extract"] + "categories": ["extract", "regression"], + "extract_method": "domExtract" }, { "name": "extract_nhl_stats", @@ -225,11 +229,13 @@ }, { "name": "observe_github", - "categories": ["observe", "regression_text_extract"] + "categories": ["observe", "regression"], + "extract_method": "textExtract" }, { "name": "observe_vantechjournal", - "categories": ["observe", "regression_text_extract"] + "categories": ["observe", "regression"], + "extract_method": "textExtract" }, { "name": "observe_amazon_add_to_cart", @@ -249,15 +255,16 @@ }, { "name": "observe_iframes1", - "categories": ["unit", "observe"] + "categories": ["regression", "observe"] }, { "name": "observe_iframes2", - "categories": ["unit", "observe"] + "categories": ["regression", "observe"] }, { "name": "extract_hamilton_weather", - "categories": ["targeted_extract", "regression_text_extract"] + "categories": ["targeted_extract", "regression"], + "extract_method": "textExtract" }, { "name": "extract_regulations_table", @@ -285,19 +292,19 @@ }, { "name": "scroll_50", - "categories": ["unit", "act"] + "categories": ["regression", "act"] }, { "name": "scroll_75", - "categories": ["unit", "act"] + "categories": ["regression", "act"] }, { "name": "nextChunk", - "categories": ["unit", "act"] + "categories": ["regression", "act"] }, { "name": "prevChunk", - "categories": ["unit", "act"] + "categories": ["regression", "act"] } ] } diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 8c00f9691..617214e1e 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -262,6 +262,15 @@ const generateFilteredTestcases = (): Testcase[] => { `No Eval function found for task name: ${input.name}`, ); } + let shouldUseTextExtract = useTextExtract; + const categories = tasksByName[input.name].categories || []; + const isRegression = categories.includes("regression"); + const regressionExtractMethod = tasksByName[input.name].extractMethod; + if (isRegression) { + if (regressionExtractMethod) { + shouldUseTextExtract = regressionExtractMethod === "textExtract"; + } + } // Execute the task let llmClient: LLMClient; @@ -312,7 +321,7 @@ const generateFilteredTestcases = (): Testcase[] => { const taskInput = await initStagehand({ logger, llmClient, - useTextExtract, + useTextExtract: shouldUseTextExtract, }); let result; try { diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index ac54ae506..e5ea48b02 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -63,13 +63,20 @@ const config = JSON.parse(fs.readFileSync(configPath, "utf-8")) satisfies { * The `tasksConfig` defines all tasks from the config file. Each task has a name and categories. * We create a mapping `tasksByName` from task name to its categories for quick lookup. */ -type TaskConfig = { name: string; categories: string[] }; +type TaskConfig = { + name: string; + categories: string[]; + extract_method?: string; +}; const tasksConfig = config.tasks as TaskConfig[]; const tasksByName = tasksConfig.reduce< - Record + Record >((acc, task) => { - acc[task.name] = { categories: task.categories }; + acc[task.name] = { + categories: task.categories, + extractMethod: task.extract_method, + }; return acc; }, {}); diff --git a/types/evals.ts b/types/evals.ts index ef4aba8f2..dc672550b 100644 --- a/types/evals.ts +++ b/types/evals.ts @@ -31,11 +31,9 @@ export const EvalCategorySchema = z.enum([ "experimental", "text_extract", "targeted_extract", - "regression_text_extract", - "regression_dom_extract", + "regression", "regression_llm_providers", "llm_clients", - "unit", ]); export type EvalCategory = z.infer; From 591b64d11a4f8896a012b476221662350c71f7a4 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 14:20:01 -0700 Subject: [PATCH 19/25] update CI --- .github/workflows/ci.yml | 98 +++++++--------------------------------- 1 file changed, 16 insertions(+), 82 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 480ccf16f..7b422e592 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -186,13 +186,13 @@ jobs: - name: Run E2E Tests (browserbase) run: npm run e2e:bb - run-regression-evals-dom-extract: + run-regression-evals: needs: [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] runs-on: ubuntu-latest timeout-minutes: 9 outputs: - regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }} + regression_score: ${{ steps.set-regression-score.outputs.regression_score }} env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} @@ -223,92 +223,26 @@ jobs: run: npm exec playwright install --with-deps - name: Run Regression Evals (domExtract) - run: npm run evals category regression_dom_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=domExtract + run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE - - name: Save Regression domExtract Results - run: mv eval-summary.json eval-summary-regression-dom.json - - - name: Log and Regression (domExtract) Evals Performance - id: set-dom-score - run: | - experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json) - regression_dom_score=$(jq '.categories.regression_dom_extract' eval-summary-regression-dom.json) - echo "regression_dom_extract category score: ${regression_dom_score}%" - echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}" - echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT" - - run-regression-evals-text-extract: - needs: - [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] - runs-on: ubuntu-latest - timeout-minutes: 9 - outputs: - regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }} - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} - BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} - BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} - HEADLESS: true - EVAL_ENV: browserbase - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: "20" - - - name: Install dependencies + - name: Log Regression Evals Performance run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Build Stagehand - run: npm run build - - - name: Install Playwright browsers - run: npm exec playwright install --with-deps - - - name: Run Regression Evals (textExtract) - run: npm run evals category regression_text_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=textExtract - - - name: Save Regression textExtract Results - run: mv eval-summary.json eval-summary-regression-text.json - - - name: Log Regression (textExtract) Evals Performance - id: set-text-score - run: | - experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json) - regression_text_score=$(jq '.categories.regression_text_extract' eval-summary-regression-text.json) - echo "regression_text_extract category score: ${regression_text_score}%" - echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}" - echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT" - - check-regression-evals-score: - needs: [run-regression-evals-text-extract, run-regression-evals-dom-extract] - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - name: Compare Overall Regression Evals Score - run: | - regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}" - regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}" - - overall_score=$(echo "(${regression_dom_score} + ${regression_text_score}) / 2" | bc -l) - echo "Overall regression score: ${overall_score}%" - - # Fail if overall score is below 90% - if (( $(echo "${overall_score} < 90" | bc -l) )); then - echo "Overall regression score is below 90%. Failing CI." + experimentName=$(jq -r '.experimentName' eval-summary.json) + echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" + if [ -f eval-summary.json ]; then + regression_score=$(jq '.categories.regression' eval-summary.json) + echo "Regression category score: regression_score%" + if (( $(echo "regression_score < 90" | bc -l) )); then + echo "Regression category score is below 90%. Failing CI." + exit 1 + fi + else + echo "Eval summary not found for regression category. Failing CI." exit 1 fi run-combination-evals: - needs: [check-regression-evals-score, determine-evals] + needs: [run-regression-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 40 env: From d25a6cb4f5c6a3d1320755460fa82976ddaf127f Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 14:26:27 -0700 Subject: [PATCH 20/25] fix job naming --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b422e592..90af4818a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -222,7 +222,7 @@ jobs: - name: Install Playwright browsers run: npm exec playwright install --with-deps - - name: Run Regression Evals (domExtract) + - name: Run Regression Evals run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE - name: Log Regression Evals Performance From 6f8ecb76e8c45fd5ea4de53602e85ad182b398af Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 15:20:02 -0700 Subject: [PATCH 21/25] wrap in try catch --- evals/tasks/observe_iframes2.ts | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/evals/tasks/observe_iframes2.ts b/evals/tasks/observe_iframes2.ts index 3934449ad..e02ebc7ab 100644 --- a/evals/tasks/observe_iframes2.ts +++ b/evals/tasks/observe_iframes2.ts @@ -1,5 +1,6 @@ import { Stagehand } from "@/dist"; import { EvalFunction } from "@/types/evals"; +import { ObserveResult } from "@/types/stagehand"; export const observe_iframes2: EvalFunction = async ({ logger, @@ -17,9 +18,21 @@ export const observe_iframes2: EvalFunction = async ({ ); await new Promise((resolve) => setTimeout(resolve, 5000)); - const observations = await stagehand.page.observe({ - instruction: "find the main header of the page", - }); + let observations: ObserveResult[]; + try { + observations = await stagehand.page.observe({ + instruction: "find the main header of the page", + }); + } catch (err) { + await stagehand.close(); + return { + _success: false, + message: err.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } if (observations.length === 0) { await stagehand.close(); From f5aef1c6bf47f63902779de07b6f142b86acc6aa Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 15:57:44 -0700 Subject: [PATCH 22/25] fix yml --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 90af4818a..6993c31c3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -231,8 +231,8 @@ jobs: echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" if [ -f eval-summary.json ]; then regression_score=$(jq '.categories.regression' eval-summary.json) - echo "Regression category score: regression_score%" - if (( $(echo "regression_score < 90" | bc -l) )); then + echo "Regression category score: $regression_score%" + if (( $(echo "$regression_score < 90" | bc -l) )); then echo "Regression category score is below 90%. Failing CI." exit 1 fi From 426116dad9f8b8137e8f4b7843449986b27c0763 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 17:47:52 -0700 Subject: [PATCH 23/25] update other amazon eval --- evals/tasks/observe_amazon_add_to_cart.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/tasks/observe_amazon_add_to_cart.ts b/evals/tasks/observe_amazon_add_to_cart.ts index 63bac0dbe..a9b810380 100644 --- a/evals/tasks/observe_amazon_add_to_cart.ts +++ b/evals/tasks/observe_amazon_add_to_cart.ts @@ -7,7 +7,7 @@ export const observe_amazon_add_to_cart: EvalFunction = async ({ logger, }) => { await stagehand.page.goto( - "https://www.amazon.com/Laptop-MacBook-Surface-Water-Resistant-Accessories/dp/B0D5M4H5CD", + "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/", ); await stagehand.page.waitForTimeout(5000); @@ -40,7 +40,8 @@ export const observe_amazon_add_to_cart: EvalFunction = async ({ await stagehand.page.waitForTimeout(2000); const currentUrl = stagehand.page.url(); - const expectedUrlPrefix = "https://www.amazon.com/ap/signin"; + const expectedUrlPrefix = + "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/sign-in.html"; await stagehand.close(); From 993b8ca0ac82f3cf018812b489af938b2b300dcd Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 17:53:47 -0700 Subject: [PATCH 24/25] vanta_h experimental --- evals/evals.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/evals.config.json b/evals/evals.config.json index 9630a86d7..6917905ac 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -205,7 +205,7 @@ }, { "name": "vanta_h", - "categories": ["observe"] + "categories": ["experimental"] }, { "name": "extract_area_codes", From 142a8af27d7b8cb15c2a42cbcff0792c81879822 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 10 Apr 2025 20:51:37 -0700 Subject: [PATCH 25/25] add text_extract eval category to CI --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6993c31c3..66fdc2dd8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,7 @@ on: env: EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest" - EVAL_CATEGORIES: "observe,act,combination,extract,text_extract" + EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract" concurrency: group: ${{ github.ref }}