Skip to content

wrap braintrust to get llm usage data #637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Apr 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/stupid-ghosts-smash.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Fix: forward along the stack trace in StagehandDefaultError
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now it's throwing the error multiple times; we need a fast-follow PR so that StagehandDefaultError is thrown only once.

159 changes: 18 additions & 141 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:

env:
EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"
EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract"

concurrency:
group: ${{ github.ref }}
Expand Down Expand Up @@ -186,64 +186,13 @@ jobs:
- name: Run E2E Tests (browserbase)
run: npm run e2e:bb

run-regression-evals-llm-providers:
run-regression-evals:
needs:
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
runs-on: ubuntu-latest
timeout-minutes: 9
outputs:
regression_llm_providers_score: ${{ steps.set-llm-providers-score.outputs.regression_llm_providers_score }}
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
HEADLESS: true
EVAL_ENV: browserbase
EVAL_MODELS: "gpt-4o-mini"
steps:
- name: Check out repository code
uses: actions/checkout@v4

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install dependencies
run: |
rm -rf node_modules
rm -f package-lock.json
npm install

- name: Build Stagehand
run: npm run build

- name: Install Playwright browsers
run: npm exec playwright install --with-deps

- name: Run Regression Evals (llmProviders)
run: npm run evals category regression_llm_providers trials=2 concurrency=8 env=BROWSERBASE

- name: Save Regression llmProviders Results
run: mv eval-summary.json eval-summary-regression-llm-providers.json

- name: Log and Regression (llmProviders) Evals Performance
id: set-llm-providers-score
run: |
experimentNameRegressionLlmProviders=$(jq -r '.experimentName' eval-summary-regression-llm-providers.json)
regression_llm_providers_score=$(jq '.categories.regression_llm_providers' eval-summary-regression-llm-providers.json)
echo "regression_llm_providers category score: ${regression_llm_providers_score}%"
echo "View regression_llm_providers results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionLlmProviders}"
echo "regression_llm_providers_score=$regression_llm_providers_score" >> "$GITHUB_OUTPUT"

run-regression-evals-dom-extract:
needs:
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
runs-on: ubuntu-latest
timeout-minutes: 9
outputs:
regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }}
regression_score: ${{ steps.set-regression-score.outputs.regression_score }}
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
Expand Down Expand Up @@ -273,99 +222,27 @@ jobs:
- name: Install Playwright browsers
run: npm exec playwright install --with-deps

- name: Run Regression Evals (domExtract)
run: npm run evals category regression_dom_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=domExtract
- name: Run Regression Evals
run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE

- name: Save Regression domExtract Results
run: mv eval-summary.json eval-summary-regression-dom.json

- name: Log and Regression (domExtract) Evals Performance
id: set-dom-score
- name: Log Regression Evals Performance
run: |
experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json)
regression_dom_score=$(jq '.categories.regression_dom_extract' eval-summary-regression-dom.json)
echo "regression_dom_extract category score: ${regression_dom_score}%"
echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}"
echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT"

run-regression-evals-text-extract:
needs:
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
runs-on: ubuntu-latest
timeout-minutes: 9
outputs:
regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }}
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
HEADLESS: true
EVAL_ENV: browserbase
steps:
- name: Check out repository code
uses: actions/checkout@v4

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install dependencies
run: |
rm -rf node_modules
rm -f package-lock.json
npm install

- name: Build Stagehand
run: npm run build

- name: Install Playwright browsers
run: npm exec playwright install --with-deps

- name: Run Regression Evals (textExtract)
run: npm run evals category regression_text_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=textExtract

- name: Save Regression textExtract Results
run: mv eval-summary.json eval-summary-regression-text.json

- name: Log Regression (textExtract) Evals Performance
id: set-text-score
run: |
experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json)
regression_text_score=$(jq '.categories.regression_text_extract' eval-summary-regression-text.json)
echo "regression_text_extract category score: ${regression_text_score}%"
echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}"
echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT"

check-regression-evals-score:
needs:
[
run-regression-evals-text-extract,
run-regression-evals-dom-extract,
run-regression-evals-llm-providers,
]
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- name: Compare Overall Regression Evals Score
run: |
regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}"
regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}"
regression_llm_providers_score="${{ needs.run-regression-evals-llm-providers.outputs.regression_llm_providers_score }}"

overall_score=$(echo "(${regression_dom_score} + ${regression_text_score} + ${regression_llm_providers_score}) / 3" | bc -l)
echo "Overall regression score: ${overall_score}%"

# Fail if overall score is below 90%
if (( $(echo "${overall_score} < 90" | bc -l) )); then
echo "Overall regression score is below 90%. Failing CI."
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
regression_score=$(jq '.categories.regression' eval-summary.json)
echo "Regression category score: $regression_score%"
if (( $(echo "$regression_score < 90" | bc -l) )); then
echo "Regression category score is below 90%. Failing CI."
exit 1
fi
else
echo "Eval summary not found for regression category. Failing CI."
exit 1
fi

run-combination-evals:
needs: [check-regression-evals-score, determine-evals]
needs: [run-regression-evals, determine-evals]
runs-on: ubuntu-latest
timeout-minutes: 40
env:
Expand Down
12 changes: 10 additions & 2 deletions evals/args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const parsedArgs: {
trials?: number;
concurrency?: number;
extractMethod?: string;
provider?: string;
leftover: string[];
} = {
leftover: [],
Expand All @@ -28,6 +29,8 @@ for (const arg of rawArgs) {
}
} else if (arg.startsWith("--extract-method=")) {
parsedArgs.extractMethod = arg.split("=")[1];
} else if (arg.startsWith("provider=")) {
parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
} else {
parsedArgs.leftover.push(arg);
}
Expand Down Expand Up @@ -64,8 +67,8 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
"text_extract",
"targeted_extract",
"regression_llm_providers",
"regression_text_extract",
"regression_dom_extract",
"regression",
"llm_clients",
];

// Finally, interpret leftover arguments to see if user typed "category X" or a single eval name
Expand Down Expand Up @@ -93,10 +96,15 @@ if (parsedArgs.leftover.length > 0) {
}
}

if (parsedArgs.provider !== undefined) {
process.env.EVAL_PROVIDER = parsedArgs.provider;
}

export {
filterByCategory,
filterByEvalName,
useTextExtract,
useAccessibilityTree,
DEFAULT_EVAL_CATEGORIES,
parsedArgs,
};
57 changes: 26 additions & 31 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
},
{
"name": "expect_act_timeout",
"categories": ["act"]
},
{
"name": "expect_act_timeout_global",
"categories": ["act"]
"categories": ["regression"]
},
{
"name": "extract_repo_name",
Expand All @@ -22,15 +18,16 @@
},
{
"name": "instructions",
"categories": ["combination"]
"categories": ["regression", "combination"]
},
{
"name": "bidnet",
"categories": ["act"]
},
{
"name": "ionwave",
"categories": ["act", "regression_dom_extract"]
"categories": ["act", "regression"],
"extract_method": "domExtract"
},
{
"name": "nonsense_action",
Expand Down Expand Up @@ -83,19 +80,20 @@
},
{
"name": "wichita",
"categories": ["combination", "regression_dom_extract"]
"categories": ["combination", "regression"],
"extract_method": "domExtract"
},
{
"name": "hn_aisdk",
"categories": ["regression_llm_providers"]
"categories": ["llm_clients"]
},
{
"name": "hn_langchain",
"categories": ["regression_llm_providers"]
"categories": ["llm_clients"]
},
{
"name": "hn_customOpenAI",
"categories": ["regression_llm_providers"]
"categories": ["llm_clients"]
},
{
"name": "apple",
Expand All @@ -119,7 +117,8 @@
},
{
"name": "extract_aigrant_companies",
"categories": ["experimental", "text_extract", "regression_text_extract"]
"categories": ["text_extract", "regression"],
"extract_method": "textExtract"
},
{
"name": "extract_capacitor_info",
Expand Down Expand Up @@ -168,7 +167,8 @@
},
{
"name": "extract_memorial_healthcare",
"categories": ["extract", "regression_dom_extract"]
"categories": ["extract", "regression"],
"extract_method": "domExtract"
},
{
"name": "extract_nhl_stats",
Expand Down Expand Up @@ -203,17 +203,9 @@
"name": "panamcs",
"categories": ["observe"]
},
{
"name": "shopify_homepage",
"categories": ["observe"]
},
{
"name": "vanta",
"categories": ["observe"]
},
{
"name": "vanta_h",
"categories": ["observe"]
"categories": ["experimental"]
},
{
"name": "extract_area_codes",
Expand All @@ -237,11 +229,13 @@
},
{
"name": "observe_github",
"categories": ["observe", "regression_text_extract"]
"categories": ["observe", "regression"],
"extract_method": "textExtract"
},
{
"name": "observe_vantechjournal",
"categories": ["observe", "regression_text_extract"]
"categories": ["observe", "regression"],
"extract_method": "textExtract"
},
{
"name": "observe_amazon_add_to_cart",
Expand All @@ -261,15 +255,16 @@
},
{
"name": "observe_iframes1",
"categories": ["observe"]
"categories": ["regression", "observe"]
},
{
"name": "observe_iframes2",
"categories": ["observe"]
"categories": ["regression", "observe"]
},
{
"name": "extract_hamilton_weather",
"categories": ["targeted_extract", "regression_text_extract"]
"categories": ["targeted_extract", "regression"],
"extract_method": "textExtract"
},
{
"name": "extract_regulations_table",
Expand Down Expand Up @@ -297,19 +292,19 @@
},
{
"name": "scroll_50",
"categories": ["act"]
"categories": ["regression", "act"]
},
{
"name": "scroll_75",
"categories": ["act", "regression_dom_extract"]
"categories": ["regression", "act"]
},
{
"name": "nextChunk",
"categories": ["act"]
"categories": ["regression", "act"]
},
{
"name": "prevChunk",
"categories": ["act"]
"categories": ["regression", "act"]
}
]
}
Loading
Loading