Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/agentic_demo/agent_val_rock_swe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ reference:
infer_batch_size: 1

reward_normalization:
grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv
method: mean
grouping: traj_group_id
method: identity
# norm_mean_type: batch
# norm_std_type: group

Expand Down
228 changes: 228 additions & 0 deletions examples/agentic_demo/agent_val_rock_swe_qwen35_2b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
# Hydra composition: pull in the trajectory-env presets and DeepSpeed ZeRO presets.
# NOTE(review): all four deepspeed_zero* groups are merged into this config in
# order, so later groups override earlier ones on any shared keys — confirm that
# stacking zero, zero2, zero3, and zero3_cpuoffload together is intentional.
defaults:
  - ../config/traj_envs@_here_
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

# Run Hydra in-place: no per-run working directory, no .hydra output subdir.
hydra:
  run:
    dir: .
  output_subdir: null

# --- Experiment identity and output locations ---
exp_name: "agentic_rollout_swe_qwen35_2b"
seed: 42

logging_dir: ./output/logs
output_dir: ./output
# Run name: experiment name plus launch timestamp (Hydra `now` resolver).
model_name: ${exp_name}-${now:%Y%m%d_%H%M%S}
rollout_dump_dir: ./output/rollout_dump
system_envs:
  # Quoted on purpose: environment variable values must stay strings.
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data

num_gpus_per_node: 8
# NOTE(review): presumably seconds (20h) — confirm the unit against the pipeline.
rpc_timeout: 72000

# --- Training schedule ---
max_steps: 200
save_steps: 50
logging_steps: 1
# NOTE(review): 0 presumably disables periodic eval — confirm in pipeline code.
eval_steps: 0
resume_from_checkpoint: false

async_generation_ratio: 1
# Qwen3.5 tool-call templates expect structured tool arguments.
parse_tool_call_parameter_to_dict: true
# Qwen3.5 chat templates do not auto-insert an empty system prompt.
skip_mock_system_prompt: true

# --- Batch sizes and sequence budget ---
rollout_batch_size: 4
val_batch_size: 4
sequence_length: 32768

# Per-step generation budget; also interpolated into the custom_envs entries below.
max_tokens_per_step: 4096

# --- RL / advantage-estimation settings ---
advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "step_reinforce"
batch_adjust_mode: "random_sample"
step_reward_gamma: 1.0  # no discounting across steps

init_kl_coef: 0.0       # KL penalty disabled
whiten_advantages: true
entropy_loss_coef: 0    # entropy bonus disabled
max_grad_norm: 1.0

# Swap these checkpoints to reuse the same setup with other Qwen3.5 dense variants.
pretrain: Qwen/Qwen3.5-2B
reward_pretrain: Qwen/Qwen3.5-2B

# Training worker: Megatron backend on GPUs 0-3.
actor_train:
  model_args:
    # fa2 doesn't work for now
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
    # Modules whose names start with this prefix are presumably kept frozen
    # during training — confirm against the trainer's freeze logic.
    freeze_module_prefix: vision_model
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      # TP=2 x CP=2 across the 4 training GPUs in device_mapping below.
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
      sequence_parallel: true
      use_distributed_optimizer: true
      recompute_granularity: full
    # GPUs 0-3 train; actor_infer below maps onto all 8 GPUs.
    device_mapping: list(range(0,4))
    infer_batch_size: 1

# Rollout/generation worker: vLLM backend sharing all 8 GPUs at 60% memory.
actor_infer:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${max_tokens_per_step}
    top_p: 1.0
    top_k: 50
    num_beams: 1
    temperature: 1.0
    num_return_sequences: 1
    # Generation halts on tool-call closing tags; the stop string is kept in
    # the output (include_stop_str_in_output) so the full tool call survives.
    stop_strings: ["</tool_call>", "</tool_call>\n", "\n</tool_call>\n", "\n</function>"]
    include_stop_str_in_output: true
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      load_format: auto
      tensor_parallel_size: 1
      max_model_len: 32768  # matches sequence_length above
    device_mapping: list(range(0,8))

# Reference (frozen) model worker: Megatron inference on GPUs 0-3,
# mirroring actor_train's parallelism layout (TP=2, CP=2).
reference:
  model_args:
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
    # Same freeze prefix as actor_train, for a consistent module set.
    freeze_module_prefix: vision_model
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
    device_mapping: list(range(0,4))
    infer_batch_size: 1

reward_normalization:
  # Group rewards per trajectory group; `identity` presumably applies no
  # normalization (per the method name) — confirm against the pipeline.
  grouping: traj_group_id
  method: identity

# Training environments: one group of 4 envs tagged RockTBNativeEnvTrain
# (the tag keys into custom_envs below).
train_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 4
  tags: [RockTBNativeEnvTrain]
  num_groups_partition: [1]
  system_envs:
    # Bootstraps a standalone CPython 3.11.15 runtime inside the sandbox.
    ROCK_RTENV_PYTHON_V31114_INSTALL_CMD: '[ -f cpython31115.tar.gz ] && rm cpython31115.tar.gz; [ -d python ] && rm -rf python; wget -q -O cpython31115.tar.gz https://mirror.nju.edu.cn/github-release/astral-sh/python-build-standalone/20260303/cpython-3.11.15+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz && tar -xzf cpython31115.tar.gz && mv python runtime-env'

# Validation environments: same layout, tagged swebench_native_verified.
val_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 4
  tags: [swebench_native_verified]
  num_groups_partition: [1]
  system_envs:
    # Same CPython bootstrap as the training envs.
    ROCK_RTENV_PYTHON_V31114_INSTALL_CMD: '[ -f cpython31115.tar.gz ] && rm cpython31115.tar.gz; [ -d python ] && rm -rf python; wget -q -O cpython31115.tar.gz https://mirror.nju.edu.cn/github-release/astral-sh/python-build-standalone/20260303/cpython-3.11.15+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz && tar -xzf cpython31115.tar.gz && mv python runtime-env'

# Cap on agent actions per trajectory; interpolated into custom_envs below.
max_actions_per_traj: 25
env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager

# Shared agent settings, referenced by both custom_envs entries below.
agent_config_common:
  agent_type: "default"
  # NOTE(review): <<PROMPT>> is presumably substituted with the task prompt at
  # runtime — confirm against the env manager's command templating.
  run_cmd: 'iflow -p <<PROMPT>> --yolo'
  # Sandbox bootstrap commands, run in order before the agent starts.
  pre_init_cmds:
    - command: "apt-get update"
      timeout_seconds: 600
    - command: "apt-get install -y curl git wget xz-utils"
      timeout_seconds: 600
    - command: "apt-get install -y build-essential libc6-dev patch procps"
      timeout_seconds: 600
    - command: "wget -q https://xrl-sandbox-bucket.oss-cn-hangzhou.aliyuncs.com/uv-files/uv-x86_64-unknown-linux-gnu.tar.gz && tar -xzf uv-x86_64-unknown-linux-gnu.tar.gz --strip-components=1 -C /usr/local/bin && uv --version"
      timeout_seconds: 600
  model_service_config:
    type: "local"
    enabled: true  # canonical lowercase boolean (was `True`)
  runtime_env_config:
    type: node
    npm_registry: "https://registry.npmmirror.com"
    custom_install_cmd: "wget --retry-connrefused --tries=10 --waitretry=2 -O ~/iflow-cli.tgz 'http://cloud.iflow.cn/iflow-cli/iflow-ai-iflow-cli-for-roll-0-4-4-v5.tgz' && npm i -g ~/iflow-cli.tgz"
  # iflow CLI configuration; all values deliberately quoted strings, since env
  # var values must not be retyped as booleans/ints by the YAML parser.
  env:
    IFLOW_apiKey: "test"
    IFLOW_baseUrl: "http://localhost:8080/v1"
    IFLOW_modelName: "ROME"
    IFLOW_searchApiKey: "88888888"
    IFLOW_selectedAuthType: "openai-compatible"
    IFLOW_disableAutoUpdate: "true"
    IFLOW_tokensLimit: "128000"
    IFLOW_shellTimeout: "360000"
    # NOTE(review): "read plan" contains a space while every other tool name is
    # snake_case — verify it is not meant to be "read_plan".
    IFLOW_coreTools: "Edit,exit_plan_mode,glob,list_directory,multi_edit,plan,read plan,read_file,read_many_files,save_memory,Search,Shell,task,web_fetch,web_search,write_file,xml_escape"

# Environment definitions keyed by the tags used in train/val_env_manager above.
custom_envs:
  # Training env (mode: train); same dataset and sandbox as the val env.
  RockTBNativeEnvTrain:
    env_type: "rock_tb_native_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    # NOTE(review): placeholder templates — presumably overridden elsewhere
    # before a real run; confirm.
    agent_system_template: "agent_system_template placeholder"
    agent_template: "agent_template placeholder"
    env_config:
      dataset_name: /ROLL/data/swe_bench_verified_example.jsonl
      tools: ~
      max_steps: ${max_actions_per_traj}
      mode: "train"
      sandbox_base_url: http://localhost:8080
      user_id: "xxx"
      experiment_id: "test_tb_native"
      test_files: ["/terminal-bench-datasets/datasets/swebench-verified"]
      agent_config: ${agent_config_common}
  # Validation env: identical to the training env except mode: val.
  swebench_native_verified:
    env_type: "rock_tb_native_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "agent_system_template placeholder"
    agent_template: "agent_template placeholder"
    env_config:
      dataset_name: /ROLL/data/swe_bench_verified_example.jsonl
      tools: ~
      max_steps: ${max_actions_per_traj}
      mode: "val"
      sandbox_base_url: http://localhost:8080
      user_id: "xxx"
      experiment_id: "test_tb_native"
      test_files: ["/terminal-bench-datasets/datasets/swebench-verified"]
      agent_config: ${agent_config_common}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Launch the agentic pipeline with the qwen35_2b SWE config.
# The Hydra config path is derived from this script's parent directory name,
# so the script must live next to agent_val_rock_swe_qwen35_2b.yaml.
set +x

# Quote $0 and the command substitutions so paths containing spaces work
# (ShellCheck SC2046/SC2086).
CONFIG_PATH=$(basename "$(dirname "$0")")
export PYTHONPATH="$PWD:$PYTHONPATH"
python examples/start_agentic_pipeline.py --config_path "$CONFIG_PATH" --config_name agent_val_rock_swe_qwen35_2b
5 changes: 3 additions & 2 deletions roll/pipeline/agentic/env_manager/agent_native_env_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def format_messages(self, rollout_cache: RolloutCache) -> DataProto:

prompt_ids = self.tokenizer.apply_chat_template(convert_list_content_str(messages, parse_tool_call_parameter_to_dict=self.pipeline_config.parse_tool_call_parameter_to_dict),
tools=self.tools,
tokenize=True, add_generation_prompt=True, enable_thinking=False)
tokenize=True, add_generation_prompt=True, enable_thinking=False,
return_dict=False)
input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)
attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0)
# Huggingface Transformers prefer position_ids to be 0-based.
Expand Down Expand Up @@ -518,4 +519,4 @@ def filter(self, group_id: int, episode_id: int, group: list[DataProto]):
return False

self.global_filter_stats["filtered"] += 1
return True
return True
3 changes: 2 additions & 1 deletion roll/pipeline/agentic/llm_proxy/proxy_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ def generate_by_proxy(
messages,
tokenize=True,
add_generation_prompt=True,
enable_thinking=enable_thinking
enable_thinking=enable_thinking,
return_dict=False
)

# Create DataProto with tokenized input
Expand Down