diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index e005c5d..9e1fd45 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -189,10 +189,6 @@ def evaluate(
 
     # run the evaluation
     print(f"Command run in sandbox {e2b_endpoint}")
-    if not isinstance(pass_k, str):
-        pass_k = ",".join(pass_k)
-    if not isinstance(selective_evaluate, str):
-        selective_evaluate = ",".join(selective_evaluate)
     sandbox.commands.run("bigcodebench.evaluate --execution 'local' "
                          f"--split {split} --subset {subset} --samples {samples} "
                          f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} "
@@ -209,9 +205,16 @@
     else:
         pass_at_k = dict()
-
-    passk = [int(k) for k in pass_k.split(",")]
+    passk = [int(k) for k in pass_k.split(",")] if isinstance(pass_k, str) else [int(k) for k in pass_k]
+    if isinstance(selective_evaluate, str):
+        selected_ids = set(selective_evaluate.split(","))
+    else:
+        try:
+            selected_ids = set(selective_evaluate)
+        except TypeError:
+            selected_ids = set()
+
     if parallel < 1:
         n_workers = max(1, multiprocessing.cpu_count() // 2)
     else:
         n_workers = parallel
 
@@ -224,11 +227,7 @@
     problems = get_bigcodebench(subset=subset)
 
     # Add selective evaluation logic
-    if selective_evaluate:
-        if isinstance(selective_evaluate, str):
-            selected_ids = set(selective_evaluate.split(","))
-        else:
-            selected_ids = set(selective_evaluate)
+    if selected_ids:
         problems = {k: v for k, v in problems.items() if k in selected_ids}
         if not problems:
             raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile
index c6ba2ca..d0e51f4 100644
--- a/sandbox-templates/e2b.Dockerfile
+++ b/sandbox-templates/e2b.Dockerfile
@@ -33,7 +33,7 @@ RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
 
 RUN pip install numpy==1.24.3 pyarrow==14.0.1
 
-RUN cd /bigcodebench && \
+RUN cd /bigcodebench && git checkout e2b_debug && \
     pip install . --no-deps
 
 RUN pip install --timeout 2000 \