Binary file added data/countdown/test.parquet
Binary file added data/countdown/train.parquet
20,000 changes: 20,000 additions & 0 deletions data/cryptarithm/test.jsonl (large diff not rendered)
Binary file added data/cryptarithm/test.parquet
80,000 changes: 80,000 additions & 0 deletions data/cryptarithm/train.jsonl (large diff not rendered)
Binary file added data/cryptarithm/train.parquet
10 changes: 6 additions & 4 deletions examples/data_preprocess/countdown.py
@@ -67,16 +67,18 @@ def make_prefix(dp, template_type):


 if __name__ == '__main__':
+    default_num_samples = 10000
+
     parser = argparse.ArgumentParser()
-    parser.add_argument('--local_dir', default='~/data/countdown')
+    parser.add_argument('--local_dir', default='/workspace/TinyZeroGRPO/data/countdown')
     parser.add_argument('--hdfs_dir', default=None)
-    parser.add_argument('--num_samples', type=int, default=100000)
+    parser.add_argument('--num_samples', type=int, default=default_num_samples)
     parser.add_argument('--num_operands', type=int, default=6)
     parser.add_argument('--max_target', type=int, default=1000)
     parser.add_argument('--min_number', type=int, default=1)
     parser.add_argument('--max_number', type=int, default=100)
-    parser.add_argument('--train_size', type=int, default=327680)
-    parser.add_argument('--test_size', type=int, default=1024)
+    parser.add_argument('--train_size', type=int, default=int(0.8 * default_num_samples + 1))
+    parser.add_argument('--test_size', type=int, default=int(0.2 * default_num_samples + 1))
     parser.add_argument('--template_type', type=str, default='base')

     args = parser.parse_args()
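A review note on the new split defaults: if the script carves both splits out of a single pool of num_samples generated examples (the rest of the file is collapsed here, so this is an assumption), the +1 rounding pushes the two sizes past the pool by two. A quick sketch of the arithmetic:

# Review sketch, not part of the PR: the new default split sizes overshoot
# num_samples, since int(0.8 * n + 1) + int(0.2 * n + 1) == n + 2 for n = 10000.
default_num_samples = 10000
train_size = int(0.8 * default_num_samples + 1)  # 8001
test_size = int(0.2 * default_num_samples + 1)   # 2001
print(train_size + test_size)                    # 10002 > 10000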
87 changes: 87 additions & 0 deletions examples/data_preprocess/cryptarithm.py
@@ -0,0 +1,87 @@
import re
import os
import datasets

from verl.utils.hdfs_io import copy, makedirs
import argparse


def extract_solution(solution_str):
    # Only score the assistant's turn, not the prompt.
    if "Assistant:" in solution_str:
        solution_str = solution_str.split("Assistant:", 1)[-1]
    elif "<|im_start|>assistant" in solution_str:
        solution_str = solution_str.split("<|im_start|>assistant", 1)[-1]

    answer_pattern = r'<answer>\s*(\d+)\s*\+\s*(\d+)\s*\+\s*(\d+)\s*=\s*(\d+)\s*</answer>'
    matches = list(re.finditer(answer_pattern, solution_str))

    if matches:
        # Use the last well-formed <answer> block in the completion.
        match = matches[-1]
        return (match.group(1), match.group(2), match.group(3), match.group(4))
    else:
        return None


def make_prefix(equation):
    question = f"""A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The user highly desires that the assistant does not give up, produces high quality thoughts, and produces many thoughts. User: Cryptarithms are puzzles in which the digits of a mathematical equation—often a simple addition problem—are replaced by letters or symbols. Each letter uniquely represents a single digit, and the challenge is to determine the correct digit for each letter so that the arithmetic operation is valid. Please solve this Cryptarithm. {equation[0]} + {equation[1]} + {equation[2]} = {equation[3]}. Show your work in <think> </think> tags. And return the decrypted equation's numbers, not symbols, in <answer> </answer> tags, for example <answer> 1948 + 3756 + 9574 = 15278 </answer>. Assistant: Let me solve this accurately step by step. <think> """

    return question


if __name__ == '__main__':
    default_num_samples = 2000

    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='/workspace/TinyZeroGRPO/data/cryptarithm')
    parser.add_argument('--hdfs_dir', default=None)
    parser.add_argument('--train_size', type=int, default=int(0.8 * default_num_samples + 1))
    parser.add_argument('--test_size', type=int, default=int(0.2 * default_num_samples + 1))

    args = parser.parse_args()

    full_train_dataset = datasets.load_dataset("json", data_files="/workspace/TinyZeroGRPO/data/cryptarithm/train.jsonl", split='train', field=None)
    full_test_dataset = datasets.load_dataset("json", data_files="/workspace/TinyZeroGRPO/data/cryptarithm/test.jsonl", split='train', field=None)
    TRAIN_SIZE = args.train_size
    TEST_SIZE = args.test_size

    assert len(full_train_dataset) >= TRAIN_SIZE
    assert len(full_test_dataset) >= TEST_SIZE

    train_dataset = full_train_dataset.select(range(TRAIN_SIZE))
    test_dataset = full_test_dataset.select(range(TEST_SIZE))

    def make_map_fn(split):
        def process_fn(example, idx):
            question = make_prefix(example['encrypted_equation'])

            data = {
                "data_source": "cryptarithm",
                "prompt": [{
                    "role": "user",
                    "content": question,
                }],
                "ability": "math",
                "reward_model": {
                    "encrypted_equation_str": example['encrypted_equation_str'],  # human-readable equation string
                    "encrypted_equation": example['encrypted_equation'],  # string array consumed by the reward fn
                    "num_solutions": example['num_solutions']
                },
                "extra_info": {
                    'split': split,  # 'train' or 'test'
                    'index': idx,
                }
            }
            return data
        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)

    local_dir = args.local_dir
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)
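For reference, extract_solution keys on the last well-formed <answer> block in the completion. A minimal smoke test (the sample string is illustrative only, not part of the PR):

# Illustrative check of extract_solution.
sample = (
    "Assistant: Let me solve this accurately step by step. "
    "<think>mapping letters to digits...</think> "
    "<answer> 1948 + 3756 + 9574 = 15278 </answer>"
)
print(extract_solution(sample))  # ('1948', '3756', '9574', '15278')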
74 changes: 74 additions & 0 deletions examples/frun.sh
@@ -0,0 +1,74 @@
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

# NOTE: commented-out overrides must not sit inside the backslash-continuation
# chain: a leading '#' ends the command at that point and every flag after it
# is silently dropped (or run as a separate, broken command). The unused
# overrides are kept here for reference instead:
#   data.return_raw_input_ids=False
#   data.return_raw_chat=False
#   actor_rollout_ref.hybrid_engine=True
#   actor_rollout_ref.model.external_lib=null
#   actor_rollout_ref.actor.strategy=fsdp
#   actor_rollout_ref.actor.ppo_micro_batch_size=16
#   actor_rollout_ref.actor.grad_clip=1.0
#   actor_rollout_ref.actor.clip_ratio=0.2
#   actor_rollout_ref.actor.entropy_coeff=0.001
#   actor_rollout_ref.actor.ppo_epochs=1
#   actor_rollout_ref.actor.shuffle=True
#   actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0
#   actor_rollout_ref.actor.optim.warmup_style=constant
#   actor_rollout_ref.actor.optim.total_training_steps=-1
#   actor_rollout_ref.actor.fsdp_config.wrap_policy.min_num_params=0
#   actor_rollout_ref.ref.fsdp_config.wrap_policy.min_num_params=0
#   actor_rollout_ref.rollout.n=1
#   actor_rollout_ref.rollout.name=vllm
#   actor_rollout_ref.rollout.temperature=1.0
#   actor_rollout_ref.rollout.top_k=-1
#   actor_rollout_ref.rollout.top_p=1
#   actor_rollout_ref.rollout.response_length='${data.max_response_length}'
#   actor_rollout_ref.rollout.dtype=float16
#   actor_rollout_ref.rollout.gpu_memory_utilization=0.6
#   actor_rollout_ref.rollout.ignore_eos=False
#   actor_rollout_ref.rollout.enforce_eager=True
#   actor_rollout_ref.rollout.free_cache_engine=True
#   actor_rollout_ref.rollout.load_format=dummy_dtensor
#   actor_rollout_ref.rollout.max_num_batched_tokens=4096
#   actor_rollout_ref.rollout.max_num_seqs=512
#   actor_rollout_ref.rollout.log_prob_micro_batch_size=32
#   actor_rollout_ref.rollout.do_sample=True
#   data.tokenizer=null
#   data.prompt_key=prompt
#   algorithm.gamma=1.0
#   algorithm.lam=1.0
#   algorithm.kl_penalty=kl
#   algorithm.kl_ctrl.type=fixed
#   trainer.total_epochs=1
#   trainer.project_name=rented_test
#   trainer.experiment_name=test

python3 -m verl.trainer.main_ppo \
    data.train_files=/workspace/TinyZeroGRPO/data/countdown/train.parquet \
    data.val_files=/workspace/TinyZeroGRPO/data/countdown/test.parquet \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.train_batch_size=128 \
    data.val_batch_size=128 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.grad_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
    algorithm.adv_estimator=grpo \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.logger=['console'] \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=2 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    trainer.save_freq=-1 \
    trainer.test_freq=-1 \
    trainer.critic_warmup=0 "$@"
6 changes: 3 additions & 3 deletions examples/grpo_trainer/run_qwen2-7b.sh
@@ -4,13 +4,13 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 python3 -m verl.trainer.main_ppo \
     algorithm.adv_estimator=grpo \
-    data.train_files=$HOME/data/gsm8k/train.parquet \
-    data.val_files=$HOME/data/gsm8k/test.parquet \
+    data.train_files=$HOME/data/cryptarithm/train.parquet \
+    data.val_files=$HOME/data/cryptarithm/test.parquet \
     data.train_batch_size=1024 \
     data.val_batch_size=1312 \
     data.max_prompt_length=512 \
     data.max_response_length=1024 \
-    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+    actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
4 changes: 2 additions & 2 deletions examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
@@ -4,8 +4,8 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 python3 -m verl.trainer.main_ppo \
     algorithm.adv_estimator=grpo \
-    data.train_files=$HOME/data/gsm8k/train.parquet \
-    data.val_files=$HOME/data/gsm8k/test.parquet \
+    data.train_files=/workspace/data/gsm8k/train.parquet \
+    data.val_files=/workspace/data/gsm8k/test.parquet \
     data.train_batch_size=1024 \
     data.val_batch_size=1312 \
     data.max_prompt_length=512 \
47 changes: 47 additions & 0 deletions examples/run.sh
@@ -0,0 +1,47 @@
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS
export RAY_BACKEND_LOG_LEVEL=debug
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=/workspace/TinyZeroGRPO/data/cryptarithm/train.parquet \
    data.val_files=/workspace/TinyZeroGRPO/data/cryptarithm/test.parquet \
    data.train_batch_size=128 \
    data.val_batch_size=128 \
    data.max_prompt_length=512 \
    data.max_response_length=2048 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.ppo_micro_batch_size=32 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.grad_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.n=1 \
    actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.rollout.dtype=float16 \
    actor_rollout_ref.rollout.ignore_eos=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger.0='console' \
    trainer.logger.1='wandb' \
    trainer.project_name='test_rent' \
    trainer.experiment_name='test' \
    trainer.n_gpus_per_node=2 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=1 "$@"
# trainer.total_training_steps=3
16 changes: 12 additions & 4 deletions examples/split_placement/main_ppo_split.py
@@ -65,14 +65,22 @@ def __call__(self, data: DataProto):
             sequences = torch.cat((valid_prompt_ids, valid_response_ids))
             sequences_str = self.tokenizer.decode(sequences)

-            ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
-
             # select rm_score
             data_source = data_item.non_tensor_batch['data_source']

+            ground_truth = None
+            if data_source == 'countdown':
+                ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
+            elif data_source == 'cryptarithm':
+                ground_truth = data_item.non_tensor_batch['reward_model']['encrypted_equation']
+
             compute_score_fn = _select_rm_score_fn(data_source)

-            score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth)
-            reward_tensor[i, valid_response_length - 1] = score
+            if ground_truth is not None:
+                score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth)
+                reward_tensor[i, valid_response_length - 1] = score
+            else:
+                reward_tensor[i, valid_response_length - 1] = 0.0

             if data_source not in already_print_data_sources:
                 already_print_data_sources[data_source] = 0
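A behavioral note on this hunk: any sample whose data_source is neither 'countdown' nor 'cryptarithm' (for example 'openai/gsm8k') now leaves ground_truth as None and silently receives a reward of 0.0, since only those two branches read the reward_model payload. That is a workable fallback for this fork, but it can mask a misconfigured data_source field.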
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -34,11 +34,13 @@ dependencies = [
     "codetiming",
     "datasets",
     "dill",
+    "flash_attn",
     "hydra-core",
     "numpy",
     "pybind11",
     "ray",
     "tensordict",
+    "torchaudio<=2.4.0",
     "transformers<4.48",
     "vllm<=0.6.3",
 ]
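One packaging caveat with the flash_attn addition: its source distribution imports torch at build time, so listing it as a plain project dependency tends to break a fresh pip install unless torch is already present (or build isolation is disabled). This is a known flash-attn behavior, not something specific to this PR.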
3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,13 +2,14 @@ accelerate
 codetiming
 datasets
 dill
-flash-attn
+flash_attn
 hydra-core
 numpy
 pandas
 pybind11
 ray
 tensordict<0.6
+torchaudio<=4.47.0
 transformers<4.48
 vllm<=0.6.3
 wandb
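Note that the two dependency lists now disagree: pyproject.toml pins torchaudio<=2.4.0 while requirements.txt says torchaudio<=4.47.0. No torchaudio 4.x exists (its versions track torch 2.x), so the 4.47.0 bound looks like a transformers-style version pasted by mistake and is effectively a no-op.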
1 change: 1 addition & 0 deletions ssh_18473463.json
@@ -0,0 +1 @@
{"ipaddr": "172.219.157.164", "port": 14441}
Binary file added train.parquet
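Both ssh_18473463.json (a bare IP and port, presumably for a rented training box) and the root-level train.parquet read as local development artifacts committed by accident; they would normally be gitignored rather than shipped with the PR.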
24 changes: 18 additions & 6 deletions verl/trainer/main_ppo.py
@@ -17,12 +17,14 @@

 from verl import DataProto
 import torch
-from verl.utils.reward_score import gsm8k, math, multiply, countdown
+from verl.utils.reward_score import gsm8k, math, multiply, countdown, cryptarithm
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer


 def _select_rm_score_fn(data_source):
-    if data_source == 'openai/gsm8k':
+    if data_source == 'cryptarithm':
+        return cryptarithm.compute_score
+    elif data_source == 'openai/gsm8k':
         return gsm8k.compute_score
     elif data_source == 'lighteval/MATH':
         return math.compute_score
@@ -71,12 +73,22 @@ def __call__(self, data: DataProto):
             sequences = torch.cat((valid_prompt_ids, valid_response_ids))
             sequences_str = self.tokenizer.decode(sequences)

-            ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
-
             # select rm_score
             data_source = data_item.non_tensor_batch['data_source']

+            ground_truth = None
+            if data_source == 'countdown':
+                ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
+            elif data_source == 'cryptarithm':
+                ground_truth = data_item.non_tensor_batch['reward_model']['encrypted_equation']
+
             compute_score_fn = _select_rm_score_fn(data_source)

-            score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth)
-            reward_tensor[i, valid_response_length - 1] = score
+            if ground_truth is not None:
+                score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth)
+                reward_tensor[i, valid_response_length - 1] = score
+            else:
+                reward_tensor[i, valid_response_length - 1] = 0.0
@@ -111,7 +123,7 @@ def main_task(config):
     # print initial config
     from pprint import pprint
     from omegaconf import OmegaConf
-    pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
+    # pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
     OmegaConf.resolve(config)

     # download the checkpoint from hdfs
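The import above references verl/utils/reward_score/cryptarithm.py, which does not appear in this diff view. For orientation, here is a hypothetical sketch of a rule-based compute_score consistent with the <answer> format and the reward_model payload built in examples/data_preprocess/cryptarithm.py. The signature mirrors countdown.compute_score, but the body, scores, and checks are assumptions, not the PR's actual code:

# Hypothetical sketch only; the real verl/utils/reward_score/cryptarithm.py is not shown in this diff.
import re


def compute_score(solution_str, ground_truth, method='strict', format_score=0.1, score=1.0):
    # ground_truth is the encrypted equation as a 4-word string array, e.g. ['SEND', 'MORE', 'MUCH', 'MONEY'].
    answer_pattern = r'<answer>\s*(\d+)\s*\+\s*(\d+)\s*\+\s*(\d+)\s*=\s*(\d+)\s*</answer>'
    matches = list(re.finditer(answer_pattern, solution_str))
    if not matches:
        return 0.0  # no parseable <answer> block at all
    nums = matches[-1].groups()

    # The decrypted numbers must actually satisfy the addition.
    if int(nums[0]) + int(nums[1]) + int(nums[2]) != int(nums[3]):
        return format_score  # well-formed answer, wrong arithmetic

    # Letters must map to digits consistently across all four words.
    mapping = {}
    for word, num in zip(ground_truth, nums):
        if len(word) != len(num):
            return format_score
        for letter, digit in zip(word, num):
            if mapping.setdefault(letter, digit) != digit:
                return format_score

    # Distinct letters must map to distinct digits.
    if len(set(mapping.values())) != len(mapping):
        return format_score
    return score

A real implementation may well differ, for example by consulting encrypted_equation_str or num_solutions from the same reward_model dict to grant credit for any of several valid decryptions.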
3 changes: 2 additions & 1 deletion verl/utils/logger/aggregate_logger.py
@@ -24,7 +24,8 @@ def concat_dict_to_str(dict: Dict, step):
         if isinstance(v, numbers.Number):
             output.append(f'{k}:{v:.3f}')
     output_str = ' - '.join(output)
-    return output_str
+    # return output_str
+    return ''


 class LocalLogger:
2 changes: 1 addition & 1 deletion verl/utils/reward_score/countdown.py
@@ -70,7 +70,7 @@ def compute_score(solution_str, ground_truth, method='strict', format_score=0.1,
     numbers = ground_truth['numbers']

     equation = extract_solution(solution_str=solution_str)
-    do_print = random.randint(1, 64) == 1
+    do_print = random.randint(1, 800) == 1

     if do_print:
         print(f"--------------------------------")