diff --git a/examples/agentic_demo/agent_val_rock_swe.yaml b/examples/agentic_demo/agent_val_rock_swe.yaml index 55110b3e6..71c1071e6 100644 --- a/examples/agentic_demo/agent_val_rock_swe.yaml +++ b/examples/agentic_demo/agent_val_rock_swe.yaml @@ -127,8 +127,8 @@ reference: infer_batch_size: 1 reward_normalization: - grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv - method: mean + grouping: traj_group_id + method: identity # norm_mean_type: batch # norm_std_type: group diff --git a/examples/agentic_demo/agent_val_rock_swe_qwen35_2b.yaml b/examples/agentic_demo/agent_val_rock_swe_qwen35_2b.yaml new file mode 100644 index 000000000..e814b36d4 --- /dev/null +++ b/examples/agentic_demo/agent_val_rock_swe_qwen35_2b.yaml @@ -0,0 +1,228 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "agentic_rollout_swe_qwen35_2b" +seed: 42 + +logging_dir: ./output/logs +output_dir: ./output +model_name: ${exp_name}-${now:%Y%m%d_%H%M%S} +rollout_dump_dir: ./output/rollout_dump +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: /data + +num_gpus_per_node: 8 +rpc_timeout: 72000 + +max_steps: 200 +save_steps: 50 +logging_steps: 1 +eval_steps: 0 +resume_from_checkpoint: false + +async_generation_ratio: 1 +# Qwen3.5 tool-call templates expect structured tool arguments. +parse_tool_call_parameter_to_dict: true +# Qwen3.5 chat templates do not auto-insert an empty system prompt. +skip_mock_system_prompt: true + +rollout_batch_size: 4 +val_batch_size: 4 +sequence_length: 32768 + +max_tokens_per_step: 4096 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "step_reinforce" +batch_adjust_mode: "random_sample" +step_reward_gamma: 1.0 + +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +# Swap these checkpoints to reuse the same setup with other Qwen3.5 dense variants. +pretrain: Qwen/Qwen3.5-2B +reward_pretrain: Qwen/Qwen3.5-2B + +actor_train: + model_args: + # fa2 doesn't work for now + flash_attn: sdpa + attn_implementation: sdpa + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + freeze_module_prefix: vision_model + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 4 + warmup_steps: 0 + data_args: + template: qwen3_coder + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + context_parallel_size: 2 + sequence_parallel: true + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,4)) + infer_batch_size: 1 + +actor_infer: + model_args: + flash_attn: sdpa + attn_implementation: sdpa + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${max_tokens_per_step} + top_p: 1.0 + top_k: 50 + num_beams: 1 + temperature: 1.0 + num_return_sequences: 1 + stop_strings: ["", "\n", "\n\n", "\n"] + include_stop_str_in_output: true + data_args: + template: qwen3_coder + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.6 + block_size: 16 + load_format: auto + tensor_parallel_size: 1 + max_model_len: 32768 + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: sdpa + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + freeze_module_prefix: vision_model + data_args: + template: qwen3_coder + strategy_args: + strategy_name: megatron_infer + strategy_config: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + context_parallel_size: 2 + device_mapping: list(range(0,4)) + infer_batch_size: 1 + +reward_normalization: + grouping: traj_group_id + method: identity + +train_env_manager: + max_env_num_per_worker: 1 + num_env_groups: 1 + group_size: 4 + tags: [RockTBNativeEnvTrain] + num_groups_partition: [1] + system_envs: + ROCK_RTENV_PYTHON_V31114_INSTALL_CMD: '[ -f cpython31115.tar.gz ] && rm cpython31115.tar.gz; [ -d python ] && rm -rf python; wget -q -O cpython31115.tar.gz https://mirror.nju.edu.cn/github-release/astral-sh/python-build-standalone/20260303/cpython-3.11.15+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz && tar -xzf cpython31115.tar.gz && mv python runtime-env' + +val_env_manager: + max_env_num_per_worker: 1 + num_env_groups: 1 + group_size: 4 + tags: [swebench_native_verified] + num_groups_partition: [1] + system_envs: + ROCK_RTENV_PYTHON_V31114_INSTALL_CMD: '[ -f cpython31115.tar.gz ] && rm cpython31115.tar.gz; [ -d python ] && rm -rf python; wget -q -O cpython31115.tar.gz https://mirror.nju.edu.cn/github-release/astral-sh/python-build-standalone/20260303/cpython-3.11.15+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz && tar -xzf cpython31115.tar.gz && mv python runtime-env' + +max_actions_per_traj: 25 +env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager + +agent_config_common: + agent_type: "default" + run_cmd: 'iflow -p <> --yolo' + pre_init_cmds: + - command: "apt-get update" + timeout_seconds: 600 + - command: "apt-get install -y curl git wget xz-utils" + timeout_seconds: 600 + - command: "apt-get install -y build-essential libc6-dev patch procps" + timeout_seconds: 600 + - command: "wget -q https://xrl-sandbox-bucket.oss-cn-hangzhou.aliyuncs.com/uv-files/uv-x86_64-unknown-linux-gnu.tar.gz && tar -xzf uv-x86_64-unknown-linux-gnu.tar.gz --strip-components=1 -C /usr/local/bin && uv --version" + timeout_seconds: 600 + model_service_config: + type: "local" + enabled: True + runtime_env_config: + type: node + npm_registry: "https://registry.npmmirror.com" + custom_install_cmd: "wget --retry-connrefused --tries=10 --waitretry=2 -O ~/iflow-cli.tgz 'http://cloud.iflow.cn/iflow-cli/iflow-ai-iflow-cli-for-roll-0-4-4-v5.tgz' && npm i -g ~/iflow-cli.tgz" + env: + IFLOW_apiKey: "test" + IFLOW_baseUrl: "http://localhost:8080/v1" + IFLOW_modelName: "ROME" + IFLOW_searchApiKey: "88888888" + IFLOW_selectedAuthType: "openai-compatible" + IFLOW_disableAutoUpdate: "true" + IFLOW_tokensLimit: "128000" + IFLOW_shellTimeout: "360000" + IFLOW_coreTools: "Edit,exit_plan_mode,glob,list_directory,multi_edit,plan,read plan,read_file,read_many_files,save_memory,Search,Shell,task,web_fetch,web_search,write_file,xml_escape" + +custom_envs: + RockTBNativeEnvTrain: + env_type: "rock_tb_native_env" + max_steps: ${max_actions_per_traj} + max_tokens_per_step: ${max_tokens_per_step} + env_manager_cls: ${env_manager_cls} + agent_system_template: "agent_system_template placeholder" + agent_template: "agent_template placeholder" + env_config: + dataset_name: /ROLL/data/swe_bench_verified_example.jsonl + tools: ~ + max_steps: ${max_actions_per_traj} + mode: "train" + sandbox_base_url: http://localhost:8080 + user_id: "xxx" + experiment_id: "test_tb_native" + test_files: ["/terminal-bench-datasets/datasets/swebench-verified"] + agent_config: ${agent_config_common} + swebench_native_verified: + env_type: "rock_tb_native_env" + max_steps: ${max_actions_per_traj} + max_tokens_per_step: ${max_tokens_per_step} + env_manager_cls: ${env_manager_cls} + agent_system_template: "agent_system_template placeholder" + agent_template: "agent_template placeholder" + env_config: + dataset_name: /ROLL/data/swe_bench_verified_example.jsonl + tools: ~ + max_steps: ${max_actions_per_traj} + mode: "val" + sandbox_base_url: http://localhost:8080 + user_id: "xxx" + experiment_id: "test_tb_native" + test_files: ["/terminal-bench-datasets/datasets/swebench-verified"] + agent_config: ${agent_config_common} diff --git a/examples/agentic_demo/run_agentic_pipeline_rock_swe_qwen35_2b.sh b/examples/agentic_demo/run_agentic_pipeline_rock_swe_qwen35_2b.sh new file mode 100755 index 000000000..00ff94001 --- /dev/null +++ b/examples/agentic_demo/run_agentic_pipeline_rock_swe_qwen35_2b.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set +x + +CONFIG_PATH=$(basename $(dirname $0)) +export PYTHONPATH="$PWD:$PYTHONPATH" +python examples/start_agentic_pipeline.py --config_path $CONFIG_PATH --config_name agent_val_rock_swe_qwen35_2b diff --git a/roll/pipeline/agentic/env_manager/agent_native_env_manager.py b/roll/pipeline/agentic/env_manager/agent_native_env_manager.py index 6d11285a6..5aff115ab 100644 --- a/roll/pipeline/agentic/env_manager/agent_native_env_manager.py +++ b/roll/pipeline/agentic/env_manager/agent_native_env_manager.py @@ -202,7 +202,8 @@ def format_messages(self, rollout_cache: RolloutCache) -> DataProto: prompt_ids = self.tokenizer.apply_chat_template(convert_list_content_str(messages, parse_tool_call_parameter_to_dict=self.pipeline_config.parse_tool_call_parameter_to_dict), tools=self.tools, - tokenize=True, add_generation_prompt=True, enable_thinking=False) + tokenize=True, add_generation_prompt=True, enable_thinking=False, + return_dict=False) input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0) attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0) # Huggingface Transformers prefer position_ids to be 0-based. @@ -518,4 +519,4 @@ def filter(self, group_id: int, episode_id: int, group: list[DataProto]): return False self.global_filter_stats["filtered"] += 1 - return True \ No newline at end of file + return True diff --git a/roll/pipeline/agentic/llm_proxy/proxy_utils.py b/roll/pipeline/agentic/llm_proxy/proxy_utils.py index 9daddcda2..3e2598f29 100644 --- a/roll/pipeline/agentic/llm_proxy/proxy_utils.py +++ b/roll/pipeline/agentic/llm_proxy/proxy_utils.py @@ -109,7 +109,8 @@ def generate_by_proxy( messages, tokenize=True, add_generation_prompt=True, - enable_thinking=enable_thinking + enable_thinking=enable_thinking, + return_dict=False ) # Create DataProto with tokenized input