Skip to content

Commit 96fd035

Browse files
regression evals (#583)
* regressions
* combination needs regression
* separate regression jobs run in parallel
* fix experiment names, fix gh output
* set scores as gh outputs
* add label check for combo
* rm needs label guard for combos
* 2 trials per
1 parent 4fdbf63 commit 96fd035

File tree

3 files changed

+152
-10
lines changed

3 files changed

+152
-10
lines changed

.github/workflows/ci.yml

Lines changed: 142 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ jobs:
2020
determine-evals:
2121
runs-on: ubuntu-latest
2222
outputs:
23+
run-combination: ${{ steps.check-labels.outputs.run-combination }}
2324
run-extract: ${{ steps.check-labels.outputs.run-extract }}
2425
run-act: ${{ steps.check-labels.outputs.run-act }}
2526
run-observe: ${{ steps.check-labels.outputs.run-observe }}
@@ -31,6 +32,7 @@ jobs:
3132
# Default to running all tests on main branch
3233
if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
3334
echo "Running all tests for main branch"
35+
echo "run-combination=true" >> $GITHUB_OUTPUT
3436
echo "run-extract=true" >> $GITHUB_OUTPUT
3537
echo "run-act=true" >> $GITHUB_OUTPUT
3638
echo "run-observe=true" >> $GITHUB_OUTPUT
@@ -40,6 +42,7 @@ jobs:
4042
fi
4143
4244
# Check for specific labels
45+
echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT
4346
echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
4447
echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
4548
echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
@@ -147,7 +150,7 @@ jobs:
147150
run: npm run e2e:local
148151

149152
run-e2e-bb-tests:
150-
needs: [run-e2e-tests]
153+
needs: [run-lint, run-build]
151154
runs-on: ubuntu-latest
152155
timeout-minutes: 50
153156
if: >
@@ -183,8 +186,129 @@ jobs:
183186
- name: Run E2E Tests (browserbase)
184187
run: npm run e2e:bb
185188

189+
run-regression-evals-dom-extract:
190+
needs:
191+
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
192+
runs-on: ubuntu-latest
193+
timeout-minutes: 7
194+
outputs:
195+
regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }}
196+
env:
197+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
198+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
199+
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
200+
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
201+
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
202+
HEADLESS: true
203+
EVAL_ENV: browserbase
204+
steps:
205+
- name: Check out repository code
206+
uses: actions/checkout@v4
207+
208+
- name: Set up Node.js
209+
uses: actions/setup-node@v4
210+
with:
211+
node-version: "20"
212+
213+
- name: Install dependencies
214+
run: |
215+
rm -rf node_modules
216+
rm -f package-lock.json
217+
npm install
218+
219+
- name: Build Stagehand
220+
run: npm run build
221+
222+
- name: Install Playwright browsers
223+
run: npm exec playwright install --with-deps
224+
225+
- name: Run Regression Evals (domExtract)
226+
run: npm run evals category regression_dom_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=domExtract
227+
228+
- name: Save Regression domExtract Results
229+
run: mv eval-summary.json eval-summary-regression-dom.json
230+
231+
- name: Log and Regression (domExtract) Evals Performance
232+
id: set-dom-score
233+
run: |
234+
experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json)
235+
regression_dom_score=$(jq '.categories.regression_dom_extract' eval-summary-regression-dom.json)
236+
echo "regression_dom_extract category score: ${regression_dom_score}%"
237+
echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}"
238+
echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT"
239+
240+
run-regression-evals-text-extract:
241+
needs:
242+
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
243+
runs-on: ubuntu-latest
244+
timeout-minutes: 7
245+
outputs:
246+
regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }}
247+
env:
248+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
249+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
250+
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
251+
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
252+
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
253+
HEADLESS: true
254+
EVAL_ENV: browserbase
255+
steps:
256+
- name: Check out repository code
257+
uses: actions/checkout@v4
258+
259+
- name: Set up Node.js
260+
uses: actions/setup-node@v4
261+
with:
262+
node-version: "20"
263+
264+
- name: Install dependencies
265+
run: |
266+
rm -rf node_modules
267+
rm -f package-lock.json
268+
npm install
269+
270+
- name: Build Stagehand
271+
run: npm run build
272+
273+
- name: Install Playwright browsers
274+
run: npm exec playwright install --with-deps
275+
276+
- name: Run Regression Evals (textExtract)
277+
run: npm run evals category regression_text_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=textExtract
278+
279+
- name: Save Regression textExtract Results
280+
run: mv eval-summary.json eval-summary-regression-text.json
281+
282+
- name: Log Regression (textExtract) Evals Performance
283+
id: set-text-score
284+
run: |
285+
experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json)
286+
regression_text_score=$(jq '.categories.regression_text_extract' eval-summary-regression-text.json)
287+
echo "regression_text_extract category score: ${regression_text_score}%"
288+
echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}"
289+
echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT"
290+
291+
check-regression-evals-score:
292+
needs: [run-regression-evals-text-extract, run-regression-evals-dom-extract]
293+
runs-on: ubuntu-latest
294+
timeout-minutes: 5
295+
steps:
296+
- name: Compare Overall Regression Evals Score
297+
run: |
298+
regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}"
299+
regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}"
300+
301+
overall_score=$(echo "(${regression_dom_score} + ${regression_text_score}) / 2" | bc -l)
302+
echo "Overall regression score: ${overall_score}%"
303+
304+
# Fail if overall score is below 90%
305+
if (( $(echo "${overall_score} < 90" | bc -l) )); then
306+
echo "Overall regression score is below 90%. Failing CI."
307+
exit 1
308+
fi
309+
186310
run-combination-evals:
187-
needs: [run-e2e-bb-tests, run-e2e-tests, determine-evals]
311+
needs: [check-regression-evals-score, determine-evals]
188312
runs-on: ubuntu-latest
189313
timeout-minutes: 40
190314
env:
@@ -199,27 +323,43 @@ jobs:
199323
- name: Check out repository code
200324
uses: actions/checkout@v4
201325

326+
- name: Check for 'combination' label
327+
id: label-check
328+
run: |
329+
if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then
330+
echo "has_label=false" >> $GITHUB_OUTPUT
331+
echo "No label for COMBINATION. Exiting with success."
332+
else
333+
echo "has_label=true" >> $GITHUB_OUTPUT
334+
fi
335+
202336
- name: Set up Node.js
337+
if: needs.determine-evals.outputs.run-combination == 'true'
203338
uses: actions/setup-node@v4
204339
with:
205340
node-version: "20"
206341

207342
- name: Install dependencies
343+
if: needs.determine-evals.outputs.run-combination == 'true'
208344
run: |
209345
rm -rf node_modules
210346
rm -f package-lock.json
211347
npm install
212348
213349
- name: Build Stagehand
350+
if: needs.determine-evals.outputs.run-combination == 'true'
214351
run: npm run build
215352

216353
- name: Install Playwright browsers
354+
if: needs.determine-evals.outputs.run-combination == 'true'
217355
run: npm exec playwright install --with-deps
218356

219357
- name: Run Combination Evals
358+
if: needs.determine-evals.outputs.run-combination == 'true'
220359
run: npm run evals category combination
221360

222361
- name: Log Combination Evals Performance
362+
if: needs.determine-evals.outputs.run-combination == 'true'
223363
run: |
224364
experimentName=$(jq -r '.experimentName' eval-summary.json)
225365
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

evals/evals.config.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
{
2828
"name": "ionwave",
29-
"categories": ["act"]
29+
"categories": ["act", "regression_dom_extract"]
3030
},
3131
{
3232
"name": "nonsense_action",
@@ -79,7 +79,7 @@
7979
},
8080
{
8181
"name": "wichita",
82-
"categories": ["combination"]
82+
"categories": ["combination", "regression_dom_extract"]
8383
},
8484

8585
{
@@ -104,7 +104,7 @@
104104
},
105105
{
106106
"name": "extract_aigrant_companies",
107-
"categories": ["experimental", "text_extract"]
107+
"categories": ["experimental", "text_extract", "regression_text_extract"]
108108
},
109109
{
110110
"name": "extract_capacitor_info",
@@ -153,7 +153,7 @@
153153
},
154154
{
155155
"name": "extract_memorial_healthcare",
156-
"categories": ["extract"]
156+
"categories": ["extract", "regression_dom_extract"]
157157
},
158158
{
159159
"name": "extract_nhl_stats",
@@ -222,11 +222,11 @@
222222
},
223223
{
224224
"name": "observe_github",
225-
"categories": ["observe"]
225+
"categories": ["observe", "regression_text_extract"]
226226
},
227227
{
228228
"name": "observe_vantechjournal",
229-
"categories": ["observe"]
229+
"categories": ["observe", "regression_text_extract"]
230230
},
231231
{
232232
"name": "observe_amazon_add_to_cart",
@@ -254,7 +254,7 @@
254254
},
255255
{
256256
"name": "extract_hamilton_weather",
257-
"categories": ["targeted_extract"]
257+
"categories": ["targeted_extract", "regression_text_extract"]
258258
},
259259
{
260260
"name": "extract_regulations_table",
@@ -286,7 +286,7 @@
286286
},
287287
{
288288
"name": "scroll_75",
289-
"categories": ["act"]
289+
"categories": ["act", "regression_dom_extract"]
290290
}
291291
]
292292
}

types/evals.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ export const EvalCategorySchema = z.enum([
2525
"experimental",
2626
"text_extract",
2727
"targeted_extract",
28+
"regression_text_extract",
29+
"regression_dom_extract",
2830
]);
2931

3032
export type EvalCategory = z.infer<typeof EvalCategorySchema>;

0 commit comments

Comments
 (0)