20
20
determine-evals :
21
21
runs-on : ubuntu-latest
22
22
outputs :
23
+ run-combination : ${{ steps.check-labels.outputs.run-combination }}
23
24
run-extract : ${{ steps.check-labels.outputs.run-extract }}
24
25
run-act : ${{ steps.check-labels.outputs.run-act }}
25
26
run-observe : ${{ steps.check-labels.outputs.run-observe }}
31
32
# Default to running all tests on main branch
32
33
if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
33
34
echo "Running all tests for main branch"
35
+ echo "run-combination=true" >> $GITHUB_OUTPUT
34
36
echo "run-extract=true" >> $GITHUB_OUTPUT
35
37
echo "run-act=true" >> $GITHUB_OUTPUT
36
38
echo "run-observe=true" >> $GITHUB_OUTPUT
40
42
fi
41
43
42
44
# Check for specific labels
45
+ echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT
43
46
echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
44
47
echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
45
48
echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
@@ -147,7 +150,7 @@ jobs:
147
150
run : npm run e2e:local
148
151
149
152
run-e2e-bb-tests :
150
- needs : [run-e2e-tests ]
153
+ needs : [run-lint, run-build ]
151
154
runs-on : ubuntu-latest
152
155
timeout-minutes : 50
153
156
if : >
@@ -183,8 +186,129 @@ jobs:
183
186
- name : Run E2E Tests (browserbase)
184
187
run : npm run e2e:bb
185
188
189
# Runs the `regression_dom_extract` eval category on Browserbase using the
# domExtract method and publishes the category score as the job output
# `regression_dom_score`.
run-regression-evals-dom-extract:
  needs:
    [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 7
  outputs:
    regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }}
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: |
        rm -rf node_modules
        rm -f package-lock.json
        npm install

    - name: Build Stagehand
      run: npm run build

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Regression Evals (domExtract)
      run: npm run evals category regression_dom_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=domExtract

    # Rename the summary so it is identifiable as the domExtract run.
    - name: Save Regression domExtract Results
      run: mv eval-summary.json eval-summary-regression-dom.json

    - name: Log Regression (domExtract) Evals Performance
      id: set-dom-score
      run: |
        set -euo pipefail
        experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json)
        # -e makes jq exit non-zero when the category is absent (null), so
        # this step fails instead of exporting an unusable score.
        regression_dom_score=$(jq -e '.categories.regression_dom_extract' eval-summary-regression-dom.json)
        echo "regression_dom_extract category score: ${regression_dom_score}%"
        echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}"
        echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT"
239
+
240
# Runs the `regression_text_extract` eval category on Browserbase using the
# textExtract method and publishes the category score as the job output
# `regression_text_score`.
run-regression-evals-text-extract:
  needs:
    [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 7
  outputs:
    regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }}
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: |
        rm -rf node_modules
        rm -f package-lock.json
        npm install

    - name: Build Stagehand
      run: npm run build

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Regression Evals (textExtract)
      run: npm run evals category regression_text_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=textExtract

    # Rename the summary so it is identifiable as the textExtract run.
    - name: Save Regression textExtract Results
      run: mv eval-summary.json eval-summary-regression-text.json

    - name: Log Regression (textExtract) Evals Performance
      id: set-text-score
      run: |
        set -euo pipefail
        experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json)
        # -e makes jq exit non-zero when the category is absent (null), so
        # this step fails instead of exporting an unusable score.
        regression_text_score=$(jq -e '.categories.regression_text_extract' eval-summary-regression-text.json)
        echo "regression_text_extract category score: ${regression_text_score}%"
        echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}"
        echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT"
290
+
291
# Gate job: averages the domExtract and textExtract regression scores
# published by the two regression jobs and fails when the average is
# below 90.
check-regression-evals-score:
  needs: [run-regression-evals-text-extract, run-regression-evals-dom-extract]
  runs-on: ubuntu-latest
  timeout-minutes: 5
  steps:
    - name: Compare Overall Regression Evals Score
      # Job outputs are handed to the script through the environment rather
      # than being interpolated into the shell source.
      env:
        REGRESSION_DOM_SCORE: ${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}
        REGRESSION_TEXT_SCORE: ${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}
      run: |
        # Reject empty or non-numeric scores up front so a malformed value
        # cannot reach bc and slip past the threshold comparison.
        for score in "$REGRESSION_DOM_SCORE" "$REGRESSION_TEXT_SCORE"; do
          if ! [[ "$score" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
            echo "Regression score '${score}' is missing or not numeric. Failing CI."
            exit 1
          fi
        done

        overall_score=$(echo "(${REGRESSION_DOM_SCORE} + ${REGRESSION_TEXT_SCORE}) / 2" | bc -l)
        echo "Overall regression score: ${overall_score}%"

        # Fail if overall score is below 90%
        if (( $(echo "${overall_score} < 90" | bc -l) )); then
          echo "Overall regression score is below 90%. Failing CI."
          exit 1
        fi
309
+
186
310
run-combination-evals :
187
- needs : [run-e2e-bb-tests, run-e2e-tests , determine-evals]
311
+ needs : [check-regression-evals-score , determine-evals]
188
312
runs-on : ubuntu-latest
189
313
timeout-minutes : 40
190
314
env :
@@ -199,27 +323,43 @@ jobs:
199
323
- name : Check out repository code
200
324
uses : actions/checkout@v4
201
325
326
+ - name : Check for 'combination' label
327
+ id : label-check
328
+ run : |
329
+ if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then
330
+ echo "has_label=false" >> $GITHUB_OUTPUT
331
+ echo "No label for COMBINATION. Exiting with success."
332
+ else
333
+ echo "has_label=true" >> $GITHUB_OUTPUT
334
+ fi
335
+
202
336
- name : Set up Node.js
337
+ if : needs.determine-evals.outputs.run-combination == 'true'
203
338
uses : actions/setup-node@v4
204
339
with :
205
340
node-version : " 20"
206
341
207
342
- name : Install dependencies
343
+ if : needs.determine-evals.outputs.run-combination == 'true'
208
344
run : |
209
345
rm -rf node_modules
210
346
rm -f package-lock.json
211
347
npm install
212
348
213
349
- name : Build Stagehand
350
+ if : needs.determine-evals.outputs.run-combination == 'true'
214
351
run : npm run build
215
352
216
353
- name : Install Playwright browsers
354
+ if : needs.determine-evals.outputs.run-combination == 'true'
217
355
run : npm exec playwright install --with-deps
218
356
219
357
- name : Run Combination Evals
358
+ if : needs.determine-evals.outputs.run-combination == 'true'
220
359
run : npm run evals category combination
221
360
222
361
- name : Log Combination Evals Performance
362
+ if : needs.determine-evals.outputs.run-combination == 'true'
223
363
run : |
224
364
experimentName=$(jq -r '.experimentName' eval-summary.json)
225
365
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
0 commit comments