From 6f1c33d2853294acb04f4332e192a21bad0ea98c Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 20:14:17 +0800 Subject: [PATCH 01/13] fix e2b --- bigcodebench/evaluate.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index e005c5d..16bd67b 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -189,10 +189,6 @@ def evaluate( # run the evaluation print(f"Command run in sandbox {e2b_endpoint}") - if not isinstance(pass_k, str): - pass_k = ",".join(pass_k) - if not isinstance(selective_evaluate, str): - selective_evaluate = ",".join(selective_evaluate) sandbox.commands.run("bigcodebench.evaluate --execution 'local' " f"--split {split} --subset {subset} --samples {samples} " f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} " @@ -210,8 +206,15 @@ def evaluate( pass_at_k = dict() - passk = [int(k) for k in pass_k.split(",")] - + if isinstance(pass_k, str): + passk = [int(k) for k in pass_k.split(",")] + else: + passk = pass_k + if isinstance(selective_evaluate, str): + selected_ids = set(selective_evaluate.split(",")) + else: + selected_ids = set(selective_evaluate) + if parallel < 1: n_workers = max(1, multiprocessing.cpu_count() // 2) else: From 8b79fc446a7cdefc0a159a16a7067c1f2227caa4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 21:09:30 +0800 Subject: [PATCH 02/13] fix --- sandbox-templates/e2b.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index c6ba2ca..bf0c799 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit +# ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench RUN pip install numpy==1.24.3 pyarrow==14.0.1 From 75469f4631be357f4d47996b8d753b338b5fd630 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 21:31:14 +0800 Subject: [PATCH 03/13] fix --- sandbox-templates/e2b.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index bf0c799..c6ba2ca 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -# ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit +ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench RUN pip install numpy==1.24.3 pyarrow==14.0.1 From 9c5726aab90262d91dcafd23daae34bc1598cb03 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 21:32:44 +0800 Subject: [PATCH 04/13] fix --- bigcodebench/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 16bd67b..d645eed 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -210,6 +210,7 @@ def evaluate( passk = [int(k) for k in pass_k.split(",")] else: passk = pass_k + if isinstance(selective_evaluate, str): selected_ids = set(selective_evaluate.split(",")) else: From de90e7eda66a256402db7b5a2719a38ea2dc6183 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 21:33:58 +0800 Subject: [PATCH 05/13] fix --- sandbox-templates/e2b.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index c6ba2ca..d0e51f4 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -33,7 +33,7 @@ RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench RUN pip install numpy==1.24.3 pyarrow==14.0.1 -RUN cd /bigcodebench && \ +RUN cd /bigcodebench && git checkout e2b_debug && \ pip install . --no-deps RUN pip install --timeout 2000 \ From 1dc1e37449eb59d590bf9a69feaa10875d9442f0 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 22:50:33 +0800 Subject: [PATCH 06/13] fix --- bigcodebench/evaluate.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index d645eed..bac8963 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -214,7 +214,10 @@ def evaluate( if isinstance(selective_evaluate, str): selected_ids = set(selective_evaluate.split(",")) else: - selected_ids = set(selective_evaluate) + try: + selected_ids = set(selective_evaluate) + except: + selected_ids = {} if parallel < 1: n_workers = max(1, multiprocessing.cpu_count() // 2) @@ -229,10 +232,6 @@ def evaluate( # Add selective evaluation logic if selective_evaluate: - if isinstance(selective_evaluate, str): - selected_ids = set(selective_evaluate.split(",")) - else: - selected_ids = set(selective_evaluate) problems = {k: v for k, v in problems.items() if k in selected_ids} if not problems: raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset") From cb1ddd097274ed857867951492ef0d8f59090f5e Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 22:51:07 +0800 Subject: [PATCH 07/13] fix --- sandbox-templates/e2b.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index d0e51f4..d11653d 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit +# ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench RUN pip install numpy==1.24.3 pyarrow==14.0.1 From 035221bd2536f0ecbec88fe710f141f8c01f7fea Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 13 Feb 2025 22:52:04 +0800 Subject: [PATCH 08/13] fix --- sandbox-templates/e2b.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index d11653d..512036c 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,8 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -# ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit +RUN echo +ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench RUN pip install numpy==1.24.3 pyarrow==14.0.1 From d32f19eed04404a63ab5ee923c6a60956ce542b9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 14 Feb 2025 01:14:09 +0800 Subject: [PATCH 09/13] fix --- sandbox-templates/e2b.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index 512036c..daa735a 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -RUN echo +RUN echo 1 ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench From 5d42541ec3b3a72f46e9c3aa85fddf6b67a23368 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 14 Feb 2025 01:56:11 +0800 Subject: [PATCH 10/13] fix --- bigcodebench/evaluate.py | 2 +- sandbox-templates/e2b.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index bac8963..7a59a89 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -217,7 +217,7 @@ def evaluate( try: selected_ids = set(selective_evaluate) except: - selected_ids = {} + selected_ids = "" if parallel < 1: n_workers = max(1, multiprocessing.cpu_count() // 2) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index daa735a..de3489e 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -RUN echo 1 +RUN echo 2 ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench From 5091ff0ef390e99fcdc0b72885142c7ab4852314 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 14 Feb 2025 03:08:40 +0800 Subject: [PATCH 11/13] fix --- bigcodebench/evaluate.py | 2 +- sandbox-templates/e2b.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 7a59a89..d6d061f 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -231,7 +231,7 @@ def evaluate( problems = get_bigcodebench(subset=subset) # Add selective evaluation logic - if selective_evaluate: + if selected_ids: problems = {k: v for k, v in problems.items() if k in selected_ids} if not problems: raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset") diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index de3489e..626cbae 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -RUN echo 2 +RUN echo 3 ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench From 0cd61745993f819076733681a40daf445fc970a9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 14 Feb 2025 03:52:17 +0800 Subject: [PATCH 12/13] fix --- bigcodebench/evaluate.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index d6d061f..9e1fd45 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -205,11 +205,7 @@ def evaluate( else: pass_at_k = dict() - - if isinstance(pass_k, str): - passk = [int(k) for k in pass_k.split(",")] - else: - passk = pass_k + passk = list(pass_k) if isinstance(selective_evaluate, str): selected_ids = set(selective_evaluate.split(",")) From 5cfe22b262580a06a6b12354c009b6a0e8ced893 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 14 Feb 2025 03:52:38 +0800 Subject: [PATCH 13/13] fix --- sandbox-templates/e2b.Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile index 626cbae..d0e51f4 100644 --- a/sandbox-templates/e2b.Dockerfile +++ b/sandbox-templates/e2b.Dockerfile @@ -28,7 +28,6 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser RUN rm -rf /bigcodebench # Acquire benchmark code to local -RUN echo 3 ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench