Skip to content

add multiprocessing support for sanitization step #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ We provide a tool namely `bigcodebench.sanitize` to clean up the code:
bigcodebench.sanitize --samples samples.jsonl --calibrate
# Sanitized code will be produced to `samples-sanitized-calibrated.jsonl`

# 💡 Optionally run the sanitization step with multiprocessing to speed it up
bigcodebench.sanitize --samples samples.jsonl --calibrate --parallel 8

# 💡 If you want to get the original results:
bigcodebench.sanitize --samples samples.jsonl
# Sanitized code will be produced to `samples-sanitized.jsonl`
Expand Down
3 changes: 2 additions & 1 deletion Requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
appdirs>=1.4.4
fire>=0.6.0
multipledispatch>=0.6.0
pqdm>=0.2.0
tempdir>=0.7.1
termcolor>=2.0.0
tqdm>=4.56.0
tree_sitter_languages>=1.10.2
tree-sitter==0.21.3
wget>=3.2
wget>=3.2
95 changes: 62 additions & 33 deletions bigcodebench/sanitize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import pathlib
from typing import Dict, Generator, List, Optional, Set, Tuple
from pqdm.processes import pqdm

from tqdm import tqdm
from tree_sitter import Node
Expand Down Expand Up @@ -178,8 +179,48 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str:
return sanitized_output


def process_solution(
    sample_solution: Dict,
    dataset: Dict,
    entry_point: Dict,
    debug_task: str = None,
    calibrate: bool = False,
    is_folder: bool = False,
    target_path: str = None,
    samples: Optional[str] = None,
):
    """Sanitize a single sample and return it as a ``task_id``/``solution`` pair.

    This is written as a standalone, module-level function so it can be
    dispatched once per sample (e.g. by a process pool); everything it needs
    must therefore arrive through its arguments, not through the caller's
    local variables.

    Args:
        sample_solution: One loaded sample. Must carry ``"_identifier"`` and
            either ``"solution"`` or ``"completion"``; ``"task_id"`` selects
            the dataset entry.
        dataset: Mapping of task id to a task record exposing
            ``"complete_prompt"``.
        entry_point: Mapping of task id to the entry-point name handed to
            ``sanitize``; a missing id yields ``None``.
        debug_task: When given, every sample whose task id differs is skipped.
        calibrate: When true and the sample has a ``"solution"``, the task's
            ``complete_prompt`` is spliced in right after the opening
            ```` ```python ```` fence before sanitizing.
        is_folder: Whether the samples come from a directory; only affects
            the progress message.
        target_path: Output location used to build the ``" -> ..."`` part of
            the progress message.
        samples: Input location that is replaced by ``target_path`` in the
            identifier for the progress message. When omitted, the
            ``" -> ..."`` suffix is left out.

    Returns:
        ``{"task_id": ..., "solution": ...}`` with the sanitized code, or
        ``None`` when the sample is skipped (missing/unknown task id, or
        filtered out by ``debug_task``).
    """
    task_id = sample_solution.get("task_id")
    if not task_id or task_id not in dataset:
        return None

    dbg_identifier = sample_solution["_identifier"]
    if debug_task is not None and task_id != debug_task:
        return None

    function_name = entry_point.get(task_id)
    old_code = sample_solution.get("solution")

    if old_code is None:
        # No full solution was stored: rebuild it from prompt + completion.
        assert "completion" in sample_solution, sample_solution
        old_code = dataset[task_id]["complete_prompt"] + "\n" + sample_solution.get("completion")
    elif calibrate:
        old_code = old_code.replace(
            "```python\n ",
            "```python\n" + dataset[task_id]["complete_prompt"] + " ",
        )

    new_code = sanitize(code=old_code, entrypoint=function_name)

    # Report only the samples that sanitization actually changed.
    if new_code != old_code:
        msg = "Sanitized: " + dbg_identifier
        # The original referenced an undefined ``samples`` here, which raised
        # NameError; the source path is now an explicit, optional argument.
        if is_folder and samples is not None and target_path is not None:
            msg += " -> " + dbg_identifier.replace(samples, target_path)
        print(msg)

    return {"task_id": task_id, "solution": new_code}


def script(
samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False
samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32
):
# task_id -> entry_point
entry_point = {}
Expand Down Expand Up @@ -211,38 +252,26 @@ def script(

new_solutions = []

for solution in tqdm(load_solutions(samples)):
task_id = solution["task_id"]
if task_id not in dataset:
print(
f"Skiping {task_id} as it does not existing in the latest EvalPlus dataset."
)
continue

function_name = entry_point[task_id] if task_id in entry_point else None
dbg_identifier = solution["_identifier"]
if debug_task is not None and task_id != debug_task:
continue

ntotal += 1
if "solution" in solution:
old_code = solution["solution"]
if calibrate:
old_code = solution["solution"].replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ")
else:
assert "completion" in solution
old_code = dataset[task_id]["complete_prompt"] + "\n" + solution["completion"]

new_code = sanitize(code=old_code, entrypoint=function_name)
# if changed, print the message
if new_code != old_code:
msg = "Sanitized: " + dbg_identifier
if is_folder:
msg += " -> " + dbg_identifier.replace(samples, target_path)
print(msg)
parallel_arg_list = [
{
"sample_solution": sample_solution,
"dataset": dataset,
"entry_point": entry_point,
"debug_task": debug_task,
"calibrate": calibrate,
"is_folder": is_folder,
"target_path": target_path
}
for sample_solution in load_solutions(samples)
]

results = pqdm(parallel_arg_list, process_solution, n_jobs=min(parallel, os.cpu_count()), argument_type="kwargs")

for result in results:
if result is not None:
new_solutions.append(result)
nsan += 1

new_solutions.append({"task_id": task_id, "solution": new_code})
ntotal += 1

if is_folder:
write_directory(target_path, new_solutions)
Expand All @@ -263,4 +292,4 @@ def main():


if __name__ == "__main__":
main()
main()