Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/agentic_demo/agent_val_rock_swe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ reference:
infer_batch_size: 1

reward_normalization:
grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv
method: mean
grouping: traj_group_id
method: identity
# norm_mean_type: batch
# norm_std_type: group

Expand Down
228 changes: 228 additions & 0 deletions examples/agentic_demo/agent_val_rock_swe_qwen35_2b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
# Hydra composition: pull in the trajectory-env presets and DeepSpeed ZeRO presets.
# NOTE(review): all four deepspeed_zero* groups are merged into this config in
# order, so later groups override earlier ones on any shared keys — confirm that
# stacking zero, zero2, zero3, and zero3_cpuoffload together is intentional.
defaults:
  - ../config/traj_envs@_here_
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

# Run Hydra in-place: no per-run working directory, no .hydra output subdir.
hydra:
  run:
    dir: .
  output_subdir: null

# --- Experiment identity and output locations ---
exp_name: "agentic_rollout_swe_qwen35_2b"
seed: 42

logging_dir: ./output/logs
output_dir: ./output
# Run name: experiment name plus launch timestamp (Hydra `now` resolver).
model_name: ${exp_name}-${now:%Y%m%d_%H%M%S}
rollout_dump_dir: ./output/rollout_dump
system_envs:
  # Quoted on purpose: environment variable values must stay strings.
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data

num_gpus_per_node: 8
# NOTE(review): presumably seconds (20h) — confirm the unit against the pipeline.
rpc_timeout: 72000

# --- Training schedule ---
max_steps: 200
save_steps: 50
logging_steps: 1
# NOTE(review): 0 presumably disables periodic eval — confirm in pipeline code.
eval_steps: 0
resume_from_checkpoint: false

async_generation_ratio: 1
# Qwen3.5 tool-call templates expect structured tool arguments.
parse_tool_call_parameter_to_dict: true
# Qwen3.5 chat templates do not auto-insert an empty system prompt.
skip_mock_system_prompt: true

# --- Batch sizes and sequence budget ---
rollout_batch_size: 4
val_batch_size: 4
sequence_length: 32768

# Per-step generation budget; also interpolated into the custom_envs entries below.
max_tokens_per_step: 4096

# --- RL / advantage-estimation settings ---
advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "step_reinforce"
batch_adjust_mode: "random_sample"
step_reward_gamma: 1.0  # no discounting across steps

init_kl_coef: 0.0       # KL penalty disabled
whiten_advantages: true
entropy_loss_coef: 0    # entropy bonus disabled
max_grad_norm: 1.0

# Swap these checkpoints to reuse the same setup with other Qwen3.5 dense variants.
pretrain: Qwen/Qwen3.5-2B
reward_pretrain: Qwen/Qwen3.5-2B

# Training worker: Megatron backend on GPUs 0-3.
actor_train:
  model_args:
    # fa2 doesn't work for now
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
    # Modules whose names start with this prefix are presumably kept frozen
    # during training — confirm against the trainer's freeze logic.
    freeze_module_prefix: vision_model
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      # TP=2 x CP=2 across the 4 training GPUs in device_mapping below.
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
      sequence_parallel: true
      use_distributed_optimizer: true
      recompute_granularity: full
    # GPUs 0-3 train; actor_infer below maps onto all 8 GPUs.
    device_mapping: list(range(0,4))
    infer_batch_size: 1

# Rollout/generation worker: vLLM backend sharing all 8 GPUs at 60% memory.
actor_infer:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${max_tokens_per_step}
    top_p: 1.0
    top_k: 50
    num_beams: 1
    temperature: 1.0
    num_return_sequences: 1
    # Generation halts on tool-call closing tags; the stop string is kept in
    # the output (include_stop_str_in_output) so the full tool call survives.
    stop_strings: ["</tool_call>", "</tool_call>\n", "\n</tool_call>\n", "\n</function>"]
    include_stop_str_in_output: true
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      load_format: auto
      tensor_parallel_size: 1
      max_model_len: 32768  # matches sequence_length above
    device_mapping: list(range(0,8))

# Reference (frozen) model worker: Megatron inference on GPUs 0-3,
# mirroring actor_train's parallelism layout (TP=2, CP=2).
reference:
  model_args:
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
    # Same freeze prefix as actor_train, for a consistent module set.
    freeze_module_prefix: vision_model
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
    device_mapping: list(range(0,4))
    infer_batch_size: 1

reward_normalization:
  # Group rewards per trajectory group; `identity` presumably applies no
  # normalization (per the method name) — confirm against the pipeline.
  grouping: traj_group_id
  method: identity

# Training environments: one group of 4 envs tagged RockTBNativeEnvTrain
# (the tag keys into custom_envs below).
train_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 4
  tags: [RockTBNativeEnvTrain]
  num_groups_partition: [1]
  system_envs:
    # Bootstraps a standalone CPython 3.11.15 runtime inside the sandbox.
    ROCK_RTENV_PYTHON_V31114_INSTALL_CMD: '[ -f cpython31115.tar.gz ] && rm cpython31115.tar.gz; [ -d python ] && rm -rf python; wget -q -O cpython31115.tar.gz https://mirror.nju.edu.cn/github-release/astral-sh/python-build-standalone/20260303/cpython-3.11.15+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz && tar -xzf cpython31115.tar.gz && mv python runtime-env'

# Validation environments: same layout, tagged swebench_native_verified.
val_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 4
  tags: [swebench_native_verified]
  num_groups_partition: [1]
  system_envs:
    # Same CPython bootstrap as the training envs.
    ROCK_RTENV_PYTHON_V31114_INSTALL_CMD: '[ -f cpython31115.tar.gz ] && rm cpython31115.tar.gz; [ -d python ] && rm -rf python; wget -q -O cpython31115.tar.gz https://mirror.nju.edu.cn/github-release/astral-sh/python-build-standalone/20260303/cpython-3.11.15+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz && tar -xzf cpython31115.tar.gz && mv python runtime-env'

# Cap on agent actions per trajectory; interpolated into custom_envs below.
max_actions_per_traj: 25
env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager

# Shared agent settings, referenced by both custom_envs entries below.
agent_config_common:
  agent_type: "default"
  # NOTE(review): <<PROMPT>> is presumably substituted with the task prompt at
  # runtime — confirm against the env manager's command templating.
  run_cmd: 'iflow -p <<PROMPT>> --yolo'
  # Sandbox bootstrap commands, run in order before the agent starts.
  pre_init_cmds:
    - command: "apt-get update"
      timeout_seconds: 600
    - command: "apt-get install -y curl git wget xz-utils"
      timeout_seconds: 600
    - command: "apt-get install -y build-essential libc6-dev patch procps"
      timeout_seconds: 600
    - command: "wget -q https://xrl-sandbox-bucket.oss-cn-hangzhou.aliyuncs.com/uv-files/uv-x86_64-unknown-linux-gnu.tar.gz && tar -xzf uv-x86_64-unknown-linux-gnu.tar.gz --strip-components=1 -C /usr/local/bin && uv --version"
      timeout_seconds: 600
  model_service_config:
    type: "local"
    enabled: true  # canonical lowercase boolean (was `True`)
  runtime_env_config:
    type: node
    npm_registry: "https://registry.npmmirror.com"
    custom_install_cmd: "wget --retry-connrefused --tries=10 --waitretry=2 -O ~/iflow-cli.tgz 'http://cloud.iflow.cn/iflow-cli/iflow-ai-iflow-cli-for-roll-0-4-4-v5.tgz' && npm i -g ~/iflow-cli.tgz"
  # iflow CLI configuration; all values deliberately quoted strings, since env
  # var values must not be retyped as booleans/ints by the YAML parser.
  env:
    IFLOW_apiKey: "test"
    IFLOW_baseUrl: "http://localhost:8080/v1"
    IFLOW_modelName: "ROME"
    IFLOW_searchApiKey: "88888888"
    IFLOW_selectedAuthType: "openai-compatible"
    IFLOW_disableAutoUpdate: "true"
    IFLOW_tokensLimit: "128000"
    IFLOW_shellTimeout: "360000"
    # NOTE(review): "read plan" contains a space while every other tool name is
    # snake_case — verify it is not meant to be "read_plan".
    IFLOW_coreTools: "Edit,exit_plan_mode,glob,list_directory,multi_edit,plan,read plan,read_file,read_many_files,save_memory,Search,Shell,task,web_fetch,web_search,write_file,xml_escape"

# Environment definitions keyed by the tags used in train/val_env_manager above.
custom_envs:
  # Training env (mode: train); same dataset and sandbox as the val env.
  RockTBNativeEnvTrain:
    env_type: "rock_tb_native_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    # NOTE(review): placeholder templates — presumably overridden elsewhere
    # before a real run; confirm.
    agent_system_template: "agent_system_template placeholder"
    agent_template: "agent_template placeholder"
    env_config:
      dataset_name: /ROLL/data/swe_bench_verified_example.jsonl
      tools: ~
      max_steps: ${max_actions_per_traj}
      mode: "train"
      sandbox_base_url: http://localhost:8080
      user_id: "xxx"
      experiment_id: "test_tb_native"
      test_files: ["/terminal-bench-datasets/datasets/swebench-verified"]
      agent_config: ${agent_config_common}
  # Validation env: identical to the training env except mode: val.
  swebench_native_verified:
    env_type: "rock_tb_native_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "agent_system_template placeholder"
    agent_template: "agent_template placeholder"
    env_config:
      dataset_name: /ROLL/data/swe_bench_verified_example.jsonl
      tools: ~
      max_steps: ${max_actions_per_traj}
      mode: "val"
      sandbox_base_url: http://localhost:8080
      user_id: "xxx"
      experiment_id: "test_tb_native"
      test_files: ["/terminal-bench-datasets/datasets/swebench-verified"]
      agent_config: ${agent_config_common}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Launch the agentic pipeline with the qwen35_2b SWE config.
# The Hydra config path is derived from this script's parent directory name,
# so the script must live next to agent_val_rock_swe_qwen35_2b.yaml.
set +x

# Quote $0 and the command substitutions so paths containing spaces work
# (ShellCheck SC2046/SC2086).
CONFIG_PATH=$(basename "$(dirname "$0")")
export PYTHONPATH="$PWD:$PYTHONPATH"
python examples/start_agentic_pipeline.py --config_path "$CONFIG_PATH" --config_name agent_val_rock_swe_qwen35_2b
5 changes: 3 additions & 2 deletions roll/pipeline/agentic/env_manager/agent_native_env_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def format_messages(self, rollout_cache: RolloutCache) -> DataProto:

prompt_ids = self.tokenizer.apply_chat_template(convert_list_content_str(messages, parse_tool_call_parameter_to_dict=self.pipeline_config.parse_tool_call_parameter_to_dict),
tools=self.tools,
tokenize=True, add_generation_prompt=True, enable_thinking=False)
tokenize=True, add_generation_prompt=True, enable_thinking=False,
return_dict=False)
input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)
attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0)
# Huggingface Transformers prefer position_ids to be 0-based.
Expand Down Expand Up @@ -518,4 +519,4 @@ def filter(self, group_id: int, episode_id: int, group: list[DataProto]):
return False

self.global_filter_stats["filtered"] += 1
return True
return True
3 changes: 2 additions & 1 deletion roll/pipeline/agentic/llm_proxy/proxy_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ def generate_by_proxy(
messages,
tokenize=True,
add_generation_prompt=True,
enable_thinking=enable_thinking
enable_thinking=enable_thinking,
return_dict=False
)

# Create DataProto with tokenized input
Expand Down