@@ -189,10 +189,6 @@ def evaluate(
189
189
190
190
# run the evaluation
191
191
print (f"Command run in sandbox { e2b_endpoint } " )
192
- if not isinstance (pass_k , str ):
193
- pass_k = "," .join (pass_k )
194
- if not isinstance (selective_evaluate , str ):
195
- selective_evaluate = "," .join (selective_evaluate )
196
192
sandbox .commands .run ("bigcodebench.evaluate --execution 'local' "
197
193
f"--split { split } --subset { subset } --samples { samples } "
198
194
f"--pass_k { pass_k } --save_pass_rate { save_pass_rate } --calibrated { calibrated } "
@@ -209,9 +205,16 @@ def evaluate(
209
205
else :
210
206
211
207
pass_at_k = dict ()
212
-
213
- passk = [int (k ) for k in pass_k .split ("," )]
208
+ passk = list (pass_k )
214
209
210
+ if isinstance (selective_evaluate , str ):
211
+ selected_ids = set (selective_evaluate .split ("," ))
212
+ else :
213
+ try :
214
+ selected_ids = set (selective_evaluate )
215
+ except :
216
+ selected_ids = ""
217
+
215
218
if parallel < 1 :
216
219
n_workers = max (1 , multiprocessing .cpu_count () // 2 )
217
220
else :
@@ -224,11 +227,7 @@ def evaluate(
224
227
problems = get_bigcodebench (subset = subset )
225
228
226
229
# Add selective evaluation logic
227
- if selective_evaluate :
228
- if isinstance (selective_evaluate , str ):
229
- selected_ids = set (selective_evaluate .split ("," ))
230
- else :
231
- selected_ids = set (selective_evaluate )
230
+ if selected_ids :
232
231
problems = {k : v for k , v in problems .items () if k in selected_ids }
233
232
if not problems :
234
233
raise ValueError (f"None of the provided task IDs { selected_ids } were found in the dataset" )
0 commit comments